# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import datetime
import logging
import time
import warnings

import common

from autotest_lib.client.common_lib import base_job
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import priorities
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite.suite import ProvisionSuite
from autotest_lib.server.cros.dynamic_suite.suite import Suite
from autotest_lib.tko import utils as tko_utils


"""CrOS dynamic test suite generation and execution module.

This module implements runtime-generated test suites for CrOS.
Design doc: http://goto.google.com/suitesv2

Individual tests can declare themselves as a part of one or more
suites, and the code here enables control files to be written
that can refer to these "dynamic suites" by name.  We also provide
support for reimaging devices with a given build and running a
dynamic suite across all reimaged devices.

The public API for defining a suite includes one method: reimage_and_run().
A suite control file can be written by importing this module and making
an appropriate call to this single method.  In normal usage, this control
file will be run in a 'hostless' server-side autotest job, scheduling
sub-jobs to do the needed reimaging and test running.

Example control file:

import common
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import dynamic_suite

dynamic_suite.reimage_and_run(
    builds={provision.CROS_VERSION_PREFIX: build}, board=board, name='bvt',
    job=job, pool=pool, check_hosts=check_hosts, add_experimental=True,
    devserver_url=devserver_url)

This will -- at runtime -- find all control files that contain "bvt" in their
"SUITE=" clause, schedule jobs to reimage devices in the
specified pool of the specified board with the specified build and, upon
completion of those jobs, schedule and wait for jobs that run all the tests it
discovered.

Suites can be run by using the atest command-line tool:
  atest suite create -b <board> -i <build/name> <suite>
e.g.
  atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt

-------------------------------------------------------------------------
Implementation details

A Suite instance represents a single test suite, defined by some predicate
run over all known control files.  The simplest example is creating a Suite
by 'name'.

create_suite_job() takes the parameters needed to define a suite run (board,
build to test, machine pool, and which suite to run), ensures important
preconditions are met, finds the appropriate suite control file, and then
schedules the hostless job that will do the rest of the work.

Note that we have more than one Dev server in our test lab architecture.
We currently load balance per-build being tested, so one and only one dev
server is used by any given run through the reimaging/testing flow.

- create_suite_job()
The primary role of create_suite_job() is to ensure that the required
artifacts for the build to be tested are staged on the dev server.  This
includes payloads required to autoupdate machines to the desired build, as
well as the autotest control files appropriate for that build.  Then, the
RPC pulls the control file for the suite to be run from the dev server and
uses it to create the suite job with the autotest frontend.

                  +----------------+
                  | Google Storage |                 Client
                  +----------------+                    |
                        | ^                             | create_suite_job()
          payloads/     | |                             |
          control files | | request                     |
                        V |                             V
  +-------------+   download request    +--------------------------+
  |             |<----------------------|                          |
  | Dev Server  |                       | Autotest Frontend (AFE)  |
  |             |---------------------->|                          |
  +-------------+  suite control file   +--------------------------+
                                                        |
                                                        V
                                               Suite Job (hostless)

- Reimage and Run
The overall process is to schedule all the tests, and then wait for the tests
to complete.

- The Reimaging Process

As an artifact of an old implementation, the number of machines to use
is called the 'sharding_factor', and the default is defined in the [CROS]
section of global_config.ini.

There used to be a 'num' parameter to control the maximum number of
machines, but it does not do anything any more.

A test control file can specify a list of DEPENDENCIES, which are really just
the set of labels a host needs to have in order for that test to be scheduled
on it.  In the case of a dynamic_suite, many tests in the suite may have
DEPENDENCIES specified.  All tests are scheduled with the DEPENDENCIES that
they specify, along with any suite dependencies that were specified, and the
scheduler will find and provision a host capable of running the test.

- Scheduling Suites
A Suite instance uses the labels specified in the suite dependencies to
schedule tests across all the hosts in the pool.  It then waits for all these
jobs.  As an optimization, the Dev server stages the payloads necessary to
run a suite in the background _after_ it has completed all the things
necessary for reimaging.  Before running a suite, reimage_and_run() calls out
to the Dev server and blocks until it's completed staging all build artifacts
needed to run test suites.

Step by step:
0) At instantiation time, find all appropriate control files for this suite
   that were included in the build to be tested.  To do this, we consult the
   Dev Server, where all these control files are staged.

  +------------+    control files?     +--------------------------+
  |            |<----------------------|                          |
  | Dev Server |                       | Autotest Frontend (AFE)  |
  |            |---------------------->|       [Suite Job]        |
  +------------+    control files!     +--------------------------+

1) Now that the Suite instance exists, it schedules jobs for every control
   file it deemed appropriate, to be run on the hosts that were labeled
   by the provisioning.  We stuff keyvals into these jobs, indicating what
   build they were testing and which suite they were for.

  +--------------------------+ Job for VersLabel       +--------+
  |                          |------------------------>| Host 1 | VersLabel
  | Autotest Frontend (AFE)  |            +--------+   +--------+
  |       [Suite Job]        |----------->| Host 2 |
  +--------------------------+ Job for    +--------+
      |                ^       VersLabel  VersLabel
      |                |
      +----------------+
       One job per test
       {'build': build/name,
        'suite': suite_name}

2) Now that all jobs are scheduled, they'll be doled out as labeled hosts
   finish their assigned work and become available again.

- Waiting on Suites
0) As we clean up each test job, we check to see if any crashes occurred.  If
   they did, we look at the 'build' keyval in the job to see which build's debug
   symbols we'll need to symbolicate the crash dump we just found.

1) Using this info, we tell a special Crash Server to stage the required debug
   symbols.  Once that's done, we ask the Crash Server to use those symbols to
   symbolicate the crash dump in question.

                  +----------------+
                  | Google Storage |
                  +----------------+
                        |     ^
             symbols!   |     | symbols?
                        V     |
  +------------+  stage symbols for build  +--------------------------+
  |            |<--------------------------|                          |
  |   Crash    |                           |                          |
  |   Server   |   dump to symbolicate     | Autotest Frontend (AFE)  |
  |            |<--------------------------|       [Suite Job]        |
  |            |-------------------------->|                          |
  +------------+    symbolicated dump      +--------------------------+

2) As jobs finish, we record their success or failure in the status of the suite
   job.  We also record a 'job keyval' in the suite job for each test, noting
   the job ID and job owner.  This can be used to refer to test logs later.
3) Once all jobs are complete, status is recorded for the suite job, and the
   job_repo_url host attribute is removed from all hosts used by the suite.

"""


# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.

class _SuiteSpec(object):
    """This class contains the info that defines a suite run."""

    # Keyword arguments that must be supplied, non-empty, and of this type.
    _REQUIRED_KEYWORDS = {
            'board': str,
            'builds': dict,
            'name': str,
            'job': base_job.base_job,
            'devserver_url': str,
    }

    # Keys of `builds` whose values are run through devserver.translate().
    _VERSION_PREFIXES = frozenset((
            provision.CROS_VERSION_PREFIX,
            provision.CROS_ANDROID_VERSION_PREFIX,
    ))

    def __init__(
            self,
            builds=None,
            board=None,
            name=None,
            job=None,
            devserver_url=None,
            pool=None,
            check_hosts=True,
            add_experimental=True,
            file_bugs=False,
            max_runtime_mins=24*60,
            timeout_mins=24*60,
            suite_dependencies=None,
            bug_template=None,
            priority=priorities.Priority.DEFAULT,
            predicate=None,
            wait_for_results=True,
            job_retry=False,
            max_retries=None,
            offload_failures_only=False,
            test_source_build=None,
            run_prod_code=False,
            delay_minutes=0,
            job_keyvals=None,
            test_args=None,
            child_dependencies=(),
            test_names=None,
            **dargs):
        """
        Vets arguments for reimage_and_run() and populates self with supplied
        values.

        Currently required args:
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}
        @param board: which kind of devices to reimage.
        @param name: a value of the SUITE control file variable to search for.
        @param job: an instance of client.common_lib.base_job representing the
                    currently running suite job.
        @param devserver_url: url to the selected devserver.

        Currently supported optional args:
        @param pool: the pool of machines to use for scheduling purposes.
        @param check_hosts: require appropriate hosts to be available now.
        @param add_experimental: schedule experimental tests as well, or not.
        @param file_bugs: File bugs when tests in this suite fail.
        @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                 this suite will run.
        @param timeout_mins: Max lifetime in minutes for each of the sub-jobs
                             that this suite runs.
        @param suite_dependencies: A list of strings of suite level
                                   dependencies, which act just like test
                                   dependencies and are appended to each test's
                                   set of dependencies at job creation time.
                                   A string of comma separated labels is
                                   accepted for backwards compatibility.
                                   The supplied sequence is copied, never
                                   modified.
        @param bug_template: A template dictionary specifying the default bug
                             filing options for failures in this suite.
        @param priority: Integer priority level.  Higher is more important.
        @param predicate: Optional argument. If present, should be a function
                          mapping ControlData objects to True if they should be
                          included in suite. If argument is absent, suite
                          behavior will default to creating a suite based
                          on the SUITE field of control files.
        @param wait_for_results: Set to False to run the suite job without
                                 waiting for test jobs to finish.
        @param job_retry: Set to True to enable job-level retry.
        @param max_retries: Maximum retry limit at suite level if not None.
                            Regardless how many times each individual test
                            has been retried, the total number of retries
                            happening in the suite can't exceed max_retries.
        @param offload_failures_only: Only enable gs_offloading for failed
                                      jobs.
        @param test_source_build: Build that contains the server-side test code,
                e.g., it can be the value of builds['cros-version:'] or
                builds['fw-version:']. None uses the server-side test code from
                builds['cros-version:'].
        @param run_prod_code: If true, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers.
        @param delay_minutes: Delay the creation of test jobs for a given number
                              of minutes.
        @param job_keyvals: General job keyvals to be inserted into keyval file
        @param test_args: A dict of args passed all the way to each individual
                          test that will be actually ran.
        @param child_dependencies: (optional) list of dependency strings
                to be added as dependencies to child jobs.
        @param test_names: (optional) if provided, Suite will consist of the
                tests named in this list.
        @param **dargs: these arguments will be ignored.  This allows us to
                        deprecate and remove arguments in ToT while not
                        breaking branch builds.

        @raises SuiteArgumentException: if a required argument is missing,
                                        empty, or of the wrong type.
        """
        self._check_init_params(
                board=board,
                builds=builds,
                name=name,
                job=job,
                devserver_url=devserver_url)

        self.board = 'board:%s' % board
        self.builds = builds
        self.name = name
        self.job = job
        self.pool = ('pool:%s' % pool) if pool else pool
        self.check_hosts = check_hosts
        self.add_experimental = add_experimental
        self.file_bugs = file_bugs
        self.dependencies = {'': []}
        self.max_runtime_mins = max_runtime_mins
        self.timeout_mins = timeout_mins
        self.bug_template = {} if bug_template is None else bug_template
        self.priority = priority
        self.wait_for_results = wait_for_results
        self.job_retry = job_retry
        self.max_retries = max_retries
        self.offload_failures_only = offload_failures_only
        self.run_prod_code = run_prod_code
        self.delay_minutes = delay_minutes
        self.job_keyvals = job_keyvals
        self.test_args = test_args
        self.child_dependencies = child_dependencies

        # Order matters: the devserver must exist before builds can be
        # translated, and suite_dependencies must exist before the build
        # labels are appended to it.
        self._init_predicate(predicate, test_names)
        self._init_suite_dependencies(suite_dependencies)
        self._init_devserver(devserver_url)
        self._init_test_source_build(test_source_build)
        self._translate_builds()
        self._add_builds_to_suite_deps()

        for key, value in dargs.iteritems():
            warnings.warn('Ignored key %r was passed to suite with value %r'
                          % (key, value))

    def _check_init_params(self, **kwargs):
        """Verify that every required keyword is present and well typed.

        @param kwargs: the required keyword arguments given to __init__.

        @raises SuiteArgumentException: if any keyword listed in
                _REQUIRED_KEYWORDS is falsy or not of the expected type.
        """
        for key, expected_type in self._REQUIRED_KEYWORDS.iteritems():
            value = kwargs.get(key)
            # TODO(ayatane): `not value` includes both the cases where value is
            # None and where value is the correct type, but empty (e.g., empty
            # dict).  It looks like this is NOT the intended behavior, but I'm
            # hesitant to remove it in case something is actually relying on
            # this behavior.
            if not value or not isinstance(value, expected_type):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>'
                        % (key, expected_type))

    def _init_predicate(self, predicate, test_names):
        """Initialize predicate attribute.

        An explicit list of test names takes precedence over a caller-supplied
        predicate; with neither, tests are selected by suite name.

        @param predicate: optional function mapping ControlData to bool.
        @param test_names: optional list of test names to run.
        """
        if test_names:
            self.predicate = Suite.test_name_in_list_predicate(test_names)
            return

        if predicate:
            self.predicate = predicate
            return

        self.predicate = Suite.name_in_tag_predicate(self.name)

    def _init_suite_dependencies(self, suite_dependencies):
        """Initialize suite dependencies attribute.

        The result is always a list owned by this object, because
        _add_builds_to_suite_deps() extends it in place and must not modify
        the sequence the caller passed in.

        @param suite_dependencies: None, a sequence of label strings, or a
                                   single string of comma separated labels.
        """
        # (str, type(u'')) is (str, unicode) on Python 2 and plain str on
        # Python 3, so text of either kind is split rather than iterated
        # character by character.
        string_types = (str, type(u''))
        if suite_dependencies is None:
            self.suite_dependencies = []
        elif isinstance(suite_dependencies, string_types):
            self.suite_dependencies = [dep.strip(' ') for dep
                                       in suite_dependencies.split(',')]
        else:
            # Defensive copy; also lets callers hand in a tuple.
            self.suite_dependencies = list(suite_dependencies)

    def _init_devserver(self, devserver_url):
        """Initialize devserver attribute.

        @param devserver_url: url to the selected devserver.
        """
        self.devserver = dev_server.ImageServer(devserver_url)

    def _init_test_source_build(self, test_source_build):
        """Initialize test_source_build attribute.

        @param test_source_build: build that holds the server-side test code,
                                  or None to let Suite pick one from builds.
        """
        if test_source_build:
            test_source_build = self.devserver.translate(test_source_build)

        self.test_source_build = Suite.get_test_source_build(
                self.builds, test_source_build=test_source_build)

    def _translate_builds(self):
        """Translate build names if they are in LATEST format."""
        for prefix in self._VERSION_PREFIXES:
            if prefix in self.builds:
                translated_build = self.devserver.translate(
                        self.builds[prefix])
                self.builds[prefix] = translated_build

    def _add_builds_to_suite_deps(self):
        """Add builds to suite_dependencies.

        To support provision both CrOS and firmware, option builds are added to
        _SuiteSpec, e.g.,

        builds = {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                  'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        version_prefix+build should make it into each test as a DEPENDENCY.
        The easiest way to do this is to tack it onto the suite_dependencies.
        """
        self.suite_dependencies.extend(
                provision.join(version_prefix, build)
                for version_prefix, build in self.builds.iteritems()
        )


class _ProvisionSuiteSpec(_SuiteSpec):
    """A _SuiteSpec that also carries the number of hosts to provision."""

    def __init__(self, num_required, **kwargs):
        """
        @param num_required: value handed to ProvisionSuite as num_required.
        @param kwargs: keyword arguments forwarded to _SuiteSpec.
        """
        self.num_required = num_required
        super(_ProvisionSuiteSpec, self).__init__(**kwargs)


def run_provision_suite(**dargs):
    """
    Run a provision suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds by scheduling dummy_Pass.

    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @param dargs: remaining keyword arguments, passed to
                  _ProvisionSuiteSpec().

    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    spec = _ProvisionSuiteSpec(**dargs)

    afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                        user=spec.job.user, debug=False)
    tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                        user=spec.job.user, debug=False)

    try:
        my_job_id = int(tko_utils.get_afe_job_id(spec.job.tag))
        logging.debug('Determined own job id: %d', my_job_id)
    except (TypeError, ValueError):
        # The tag did not yield an integer id; run without a parent job id.
        my_job_id = None
        logging.warning('Could not determine own job id.')

    suite = ProvisionSuite(
            tag=spec.name,
            builds=spec.builds,
            board=spec.board,
            devserver=spec.devserver,
            num_required=spec.num_required,
            afe=afe,
            tko=tko,
            pool=spec.pool,
            results_dir=spec.job.resultdir,
            max_runtime_mins=spec.max_runtime_mins,
            timeout_mins=spec.timeout_mins,
            file_bugs=spec.file_bugs,
            suite_job_id=my_job_id,
            extra_deps=spec.suite_dependencies,
            priority=spec.priority,
            wait_for_results=spec.wait_for_results,
            job_retry=spec.job_retry,
            max_retries=spec.max_retries,
            offload_failures_only=spec.offload_failures_only,
            test_source_build=spec.test_source_build,
            run_prod_code=spec.run_prod_code,
            job_keyvals=spec.job_keyvals,
            test_args=spec.test_args,
            child_dependencies=spec.child_dependencies,
    )

    _run_suite_with_spec(suite, spec)

    logging.debug('Returning from dynamic_suite.run_provision_suite')


def reimage_and_run(**dargs):
    """
    Backward-compatible API for dynamic_suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds, and then run the indicated test suite on them.
    Guaranteed to be compatible with any build from stable to dev.

    @param dargs: Dictionary containing the arguments passed to _SuiteSpec().
    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    suite_spec = _SuiteSpec(**dargs)

    afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)
    tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)

    try:
        # Read the job from the validated spec, as run_provision_suite() does.
        my_job_id = int(tko_utils.get_afe_job_id(suite_spec.job.tag))
        logging.debug('Determined own job id: %d', my_job_id)
    except (TypeError, ValueError):
        # The tag did not yield an integer id; run without a parent job id.
        my_job_id = None
        logging.warning('Could not determine own job id.')

    _perform_reimage_and_run(suite_spec, afe, tko, suite_job_id=my_job_id)

    logging.debug('Returning from dynamic_suite.reimage_and_run.')


def _perform_reimage_and_run(spec, afe, tko, suite_job_id=None):
    """
    Do the work of reimaging hosts and running tests.

    @param spec: a populated _SuiteSpec object.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param suite_job_id: Job id that will act as parent id to all sub jobs.
                         Default: None
    """
    # We can't create the suite until the devserver has finished downloading
    # control_files and test_suites packages so that we can get the control
    # files to schedule.
    if not spec.run_prod_code:
        _stage_artifacts_for_build(spec.devserver, spec.test_source_build)
    suite = Suite.create_from_predicates(
            predicates=[spec.predicate],
            name=spec.name,
            builds=spec.builds,
            board=spec.board,
            devserver=spec.devserver,
            afe=afe,
            tko=tko,
            pool=spec.pool,
            results_dir=spec.job.resultdir,
            max_runtime_mins=spec.max_runtime_mins,
            timeout_mins=spec.timeout_mins,
            file_bugs=spec.file_bugs,
            suite_job_id=suite_job_id,
            extra_deps=spec.suite_dependencies,
            priority=spec.priority,
            wait_for_results=spec.wait_for_results,
            job_retry=spec.job_retry,
            max_retries=spec.max_retries,
            offload_failures_only=spec.offload_failures_only,
            test_source_build=spec.test_source_build,
            run_prod_code=spec.run_prod_code,
            job_keyvals=spec.job_keyvals,
            test_args=spec.test_args,
            child_dependencies=spec.child_dependencies,
    )
    _run_suite_with_spec(suite, spec)


def _run_suite_with_spec(suite, spec):
    """
    Do the work of reimaging hosts and running tests.

    @param suite: _BaseSuite instance to run.
    @param spec: a populated _SuiteSpec object.
    """
    _run_suite(
        suite=suite,
        job=spec.job,
        delay_minutes=spec.delay_minutes,
        bug_template=spec.bug_template)


def _run_suite(
        suite,
        job,
        delay_minutes,
        bug_template):
    """
    Run a suite.

    @param suite: _BaseSuite instance.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @param delay_minutes: Delay the creation of test jobs for a given number
                          of minutes.
    @param bug_template: A template dictionary specifying the default bug
                         filing options for failures in this suite.
                         Accepted for interface compatibility; this function
                         does not read it.
    """
    timestamp = datetime.datetime.now().strftime(time_utils.TIME_FMT)
    utils.write_keyval(
        job.resultdir,
        {constants.ARTIFACT_FINISHED_TIME: timestamp})

    if delay_minutes:
        logging.debug('delay_minutes is set. Sleeping %d minutes before '
                      'creating test jobs.', delay_minutes)
        time.sleep(delay_minutes*60)
        logging.debug('Finished waiting for %d minutes before creating test '
                      'jobs.', delay_minutes)

    # Now we get to asynchronously schedule tests.
    suite.schedule(job.record_entry)

    if suite.wait_for_results:
        logging.debug('Waiting on suite.')
        suite.wait(job.record_entry)
        logging.debug('Finished waiting on suite. '
                      'Returning from _perform_reimage_and_run.')
    else:
        logging.info('wait_for_results is set to False, suite job will exit '
                     'without waiting for test jobs to finish.')


def _stage_artifacts_for_build(devserver, build):
    """Stage artifacts for a suite job.

    @param devserver: devserver to stage artifacts with.
    @param build: image to stage artifacts for.

    @raises AsynchronousBuildFailure: if the devserver fails to stage the
                                      control files or test suites.
    """
    try:
        devserver.stage_artifacts(
                image=build,
                artifacts=['control_files', 'test_suites'])
    except dev_server.DevServerException as e:
        # If we can't get the control files, there's nothing to run.
        raise error.AsynchronousBuildFailure(e)


# This function is used by the cros_test_platform suite, to unwrap json-decoded
# arguments from the cros_test_platform recipe and convert them to byte string.
#
# It should not be used for other purposes. It exists in this module simply
# to limit the number of necessary module imports in cros_test_platform.
def byteify(input):
    """Recursively turn the unicode strings of a json-decoded value into bytes.

    Dicts and lists are rebuilt with every key, value and element converted,
    unicode strings are utf-8 encoded, and any other value is handed back
    unchanged.

    @param input: json-decoded object (dict, list, unicode string or other).

    @return: an equivalent object whose unicode strings are byte strings.
    """
    if isinstance(input, dict):
        converted = {}
        for raw_key, raw_value in input.iteritems():
            converted[byteify(raw_key)] = byteify(raw_value)
        return converted

    if isinstance(input, list):
        items = []
        for element in input:
            items.append(byteify(element))
        return items

    if isinstance(input, unicode):
        return input.encode('utf-8')

    return input