1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5import datetime 6import logging 7import time 8import warnings 9 10import common 11 12from autotest_lib.client.common_lib import base_job 13from autotest_lib.client.common_lib import error 14from autotest_lib.client.common_lib import priorities 15from autotest_lib.client.common_lib import time_utils 16from autotest_lib.client.common_lib import utils 17from autotest_lib.client.common_lib.cros import dev_server 18from autotest_lib.server.cros import provision 19from autotest_lib.server.cros.dynamic_suite import constants 20from autotest_lib.server.cros.dynamic_suite import frontend_wrappers 21from autotest_lib.server.cros.dynamic_suite.suite import ProvisionSuite 22from autotest_lib.server.cros.dynamic_suite.suite import Suite 23from autotest_lib.tko import utils as tko_utils 24 25 26"""CrOS dynamic test suite generation and execution module. 27 28This module implements runtime-generated test suites for CrOS. 29Design doc: http://goto.google.com/suitesv2 30 31Individual tests can declare themselves as a part of one or more 32suites, and the code here enables control files to be written 33that can refer to these "dynamic suites" by name. We also provide 34support for reimaging devices with a given build and running a 35dynamic suite across all reimaged devices. 36 37The public API for defining a suite includes one method: reimage_and_run(). 38A suite control file can be written by importing this module and making 39an appropriate call to this single method. In normal usage, this control 40file will be run in a 'hostless' server-side autotest job, scheduling 41sub-jobs to do the needed reimaging and test running. 
42 43Example control file: 44 45import common 46from autotest_lib.server.cros import provision 47from autotest_lib.server.cros.dynamic_suite import dynamic_suite 48 49dynamic_suite.reimage_and_run( 50 builds={provision.CROS_VERSION_PREFIX: build}, board=board, name='bvt', 51 job=job, pool=pool, check_hosts=check_hosts, add_experimental=True, num=num, 52 devserver_url=devserver_url) 53 54This will -- at runtime -- find all control files that contain "bvt" in their 55"SUITE=" clause, schedule jobs to reimage |num| or fewer devices in the 56specified pool of the specified board with the specified build and, upon 57completion of those jobs, schedule and wait for jobs that run all the tests it 58discovered. 59 60Suites can be run by using the atest command-line tool: 61 atest suite create -b <board> -i <build/name> <suite> 62e.g. 63 atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt 64 65------------------------------------------------------------------------- 66Implementation details 67 68A Suite instance represents a single test suite, defined by some predicate 69run over all known control files. The simplest example is creating a Suite 70by 'name'. 71 72create_suite_job() takes the parameters needed to define a suite run (board, 73build to test, machine pool, and which suite to run), ensures important 74preconditions are met, finds the appropriate suite control file, and then 75schedules the hostless job that will do the rest of the work. 76 77Note that we have more than one Dev server in our test lab architecture. 78We currently load balance per-build being tested, so one and only one dev 79server is used by any given run through the reimaging/testing flow. 80 81- create_suite_job() 82The primary role of create_suite_job() is to ensure that the required 83artifacts for the build to be tested are staged on the dev server. 
This 84includes payloads required to autoupdate machines to the desired build, as 85well as the autotest control files appropriate for that build. Then, the 86RPC pulls the control file for the suite to be run from the dev server and 87uses it to create the suite job with the autotest frontend. 88 89 +----------------+ 90 | Google Storage | Client 91 +----------------+ | 92 | ^ | create_suite_job() 93 payloads/ | | | 94 control files | | request | 95 V | V 96 +-------------+ download request +--------------------------+ 97 | |<----------------------| | 98 | Dev Server | | Autotest Frontend (AFE) | 99 | |---------------------->| | 100 +-------------+ suite control file +--------------------------+ 101 | 102 V 103 Suite Job (hostless) 104 105- Reimage and Run 106The overall process is to schedule all the tests, and then wait for the tests 107to complete. 108 109- The Reimaging Process 110 111As an artifact of an old implementation, the number of machines to use 112is called the 'sharding_factor', and the default is defined in the [CROS] 113section of global_config.ini. This can be overridden by passing a 'num=N' 114parameter to create_suite_job(), which is piped through to reimage_and_run() 115just like the 'build' and 'board' parameters are. However, with provisioning, 116this machine accounting hasn't been implemented nor removed. However, 'num' is 117still passed around, as it might be used one day. 118 119A test control file can specify a list of DEPENDENCIES, which are really just 120the set of labels a host needs to have in order for that test to be scheduled 121on it. In the case of a dynamic_suite, many tests in the suite may have 122DEPENDENCIES specified. All tests are scheduled with the DEPENDENCIES that 123they specify, along with any suite dependencies that were specified, and the 124scheduler will find and provision a host capable of running the test. 
125 126- Scheduling Suites 127A Suite instance uses the labels specified in the suite dependencies to 128schedule tests across all the hosts in the pool. It then waits for all these 129jobs. As an optimization, the Dev server stages the payloads necessary to 130run a suite in the background _after_ it has completed all the things 131necessary for reimaging. Before running a suite, reimage_and_run() calls out 132to the Dev server and blocks until it's completed staging all build artifacts 133needed to run test suites. 134 135Step by step: 1360) At instantiation time, find all appropriate control files for this suite 137 that were included in the build to be tested. To do this, we consult the 138 Dev Server, where all these control files are staged. 139 140 +------------+ control files? +--------------------------+ 141 | |<----------------------| | 142 | Dev Server | | Autotest Frontend (AFE) | 143 | |---------------------->| [Suite Job] | 144 +------------+ control files! +--------------------------+ 145 1461) Now that the Suite instance exists, it schedules jobs for every control 147 file it deemed appropriate, to be run on the hosts that were labeled 148 by the provisioning. We stuff keyvals into these jobs, indicating what 149 build they were testing and which suite they were for. 150 151 +--------------------------+ Job for VersLabel +--------+ 152 | |------------------------>| Host 1 | VersLabel 153 | Autotest Frontend (AFE) | +--------+ +--------+ 154 | [Suite Job] |----------->| Host 2 | 155 +--------------------------+ Job for +--------+ 156 | ^ VersLabel VersLabel 157 | | 158 +----------------+ 159 One job per test 160 {'build': build/name, 161 'suite': suite_name} 162 1632) Now that all jobs are scheduled, they'll be doled out as labeled hosts 164 finish their assigned work and become available again. 165 166- Waiting on Suites 1670) As we clean up each test job, we check to see if any crashes occurred. 
If 168 they did, we look at the 'build' keyval in the job to see which build's debug 169 symbols we'll need to symbolicate the crash dump we just found. 170 1711) Using this info, we tell a special Crash Server to stage the required debug 172 symbols. Once that's done, we ask the Crash Server to use those symbols to 173 symbolicate the crash dump in question. 174 175 +----------------+ 176 | Google Storage | 177 +----------------+ 178 | ^ 179 symbols! | | symbols? 180 V | 181 +------------+ stage symbols for build +--------------------------+ 182 | |<--------------------------| | 183 | Crash | | | 184 | Server | dump to symbolicate | Autotest Frontend (AFE) | 185 | |<--------------------------| [Suite Job] | 186 | |-------------------------->| | 187 +------------+ symbolicated dump +--------------------------+ 188 1892) As jobs finish, we record their success or failure in the status of the suite 190 job. We also record a 'job keyval' in the suite job for each test, noting 191 the job ID and job owner. This can be used to refer to test logs later. 1923) Once all jobs are complete, status is recorded for the suite job, and the 193 job_repo_url host attribute is removed from all hosts used by the suite. 194 195""" 196 197 198# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py. 
class _SuiteSpec(object):
    """This class contains the info that defines a suite run."""

    # Arguments that must be supplied, truthy, and of the given type.
    _REQUIRED_KEYWORDS = {
            'board': str,
            'builds': dict,
            'name': str,
            'job': base_job.base_job,
            'devserver_url': str,
    }

    # Keys of `builds` whose values are passed through devserver.translate().
    _VERSION_PREFIXES = frozenset((
            provision.CROS_VERSION_PREFIX,
            provision.ANDROID_BUILD_VERSION_PREFIX,
    ))

    def __init__(
            self,
            builds=None,
            board=None,
            name=None,
            job=None,
            devserver_url=None,
            pool=None,
            num=None,
            check_hosts=True,
            add_experimental=True,
            file_bugs=False,
            max_runtime_mins=24*60,
            timeout_mins=24*60,
            suite_dependencies=None,
            bug_template=None,
            priority=priorities.Priority.DEFAULT,
            predicate=None,
            wait_for_results=True,
            job_retry=False,
            max_retries=None,
            offload_failures_only=False,
            test_source_build=None,
            run_prod_code=False,
            delay_minutes=0,
            job_keyvals=None,
            test_args=None,
            **dargs):
        """
        Vets arguments for reimage_and_run() and populates self with supplied
        values.

        Currently required args:
        @param builds: the builds to install e.g.
                       {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                        'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}
                       Note that this dict is stored by reference and its
                       version-prefixed entries are overwritten in place
                       with the names returned by the devserver.
        @param board: which kind of devices to reimage.
        @param name: a value of the SUITE control file variable to search for.
        @param job: an instance of client.common_lib.base_job representing the
                    currently running suite job.
        @param devserver_url: url to the selected devserver.

        Currently supported optional args:
        @param pool: the pool of machines to use for scheduling purposes.
        @param num: the maximum number of devices to reimage.
        @param check_hosts: require appropriate hosts to be available now.
        @param add_experimental: schedule experimental tests as well, or not.
        @param file_bugs: File bugs when tests in this suite fail.
        @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                 this suite will run.
        @param timeout_mins: Max lifetime in minutes for each of the sub-jobs
                             that this suite runs.
        @param suite_dependencies: A list of strings of suite level
                                   dependencies, which act just like test
                                   dependencies and are appended to each test's
                                   set of dependencies at job creation time.
                                   A string of comma separated labels is
                                   accepted for backwards compatibility.
                                   The supplied list is copied, never modified.
        @param bug_template: A template dictionary specifying the default bug
                             filing options for failures in this suite.
        @param priority: Integer priority level.  Higher is more important.
        @param predicate: Optional argument. If present, should be a function
                          mapping ControlData objects to True if they should be
                          included in suite. If argument is absent, suite
                          behavior will default to creating a suite based
                          on the SUITE field of control files.
        @param wait_for_results: Set to False to run the suite job without
                                 waiting for test jobs to finish.
        @param job_retry: Set to True to enable job-level retry.
        @param max_retries: Maximum retry limit at suite level if not None.
                            Regardless how many times each individual test
                            has been retried, the total number of retries
                            happening in the suite can't exceed max_retries.
        @param offload_failures_only: Only enable gs_offloading for failed
                                      jobs.
        @param test_source_build: Build that contains the server-side test code,
                e.g., it can be the value of builds['cros-version:'] or
                builds['fw-version:']. None uses the server-side test code from
                builds['cros-version:'].
        @param run_prod_code: If true, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers.
        @param delay_minutes: Delay the creation of test jobs for a given number
                              of minutes.
        @param job_keyvals: General job keyvals to be inserted into keyval file
        @param test_args: A dict of args passed all the way to each individual
                          test that will actually be run.
        @param **dargs: these arguments will be ignored (a warning is issued
                        for each).  This allows us to deprecate and remove
                        arguments in ToT while not breaking branch builds.

        @raises SuiteArgumentException: if a required argument is missing,
                                        falsy, or of the wrong type.
        """
        self._check_init_params(
                board=board,
                builds=builds,
                name=name,
                job=job,
                devserver_url=devserver_url)

        # board and pool are stored with their 'board:' / 'pool:' prefixes.
        self.board = 'board:%s' % board
        self.builds = builds
        self.name = name
        self.job = job
        self.pool = ('pool:%s' % pool) if pool else pool
        self.num = num
        self.check_hosts = check_hosts
        self.add_experimental = add_experimental
        self.file_bugs = file_bugs
        # NOTE(review): not derived from any argument; presumably kept for
        # older consumers of this attribute -- confirm before removing.
        self.dependencies = {'': []}
        self.max_runtime_mins = max_runtime_mins
        self.timeout_mins = timeout_mins
        self.bug_template = {} if bug_template is None else bug_template
        self.priority = priority
        self.wait_for_results = wait_for_results
        self.job_retry = job_retry
        self.max_retries = max_retries
        self.offload_failures_only = offload_failures_only
        self.run_prod_code = run_prod_code
        self.delay_minutes = delay_minutes
        self.job_keyvals = job_keyvals
        self.test_args = test_args

        # Order matters: the devserver must exist before builds can be
        # translated, and builds must be translated before they are turned
        # into suite dependencies.
        self._init_predicate(predicate)
        self._init_suite_dependencies(suite_dependencies)
        self._init_devserver(devserver_url)
        self._init_test_source_build(test_source_build)
        self._translate_builds()
        self._add_builds_to_suite_deps()

        for key, value in dargs.items():
            warnings.warn('Ignored key %r was passed to suite with value %r'
                          % (key, value))

    def _check_init_params(self, **kwargs):
        """Verify that every required keyword is present and well-typed.

        @param kwargs: the candidate values, keyed by argument name.

        @raises SuiteArgumentException: if any key of _REQUIRED_KEYWORDS is
                                        missing from kwargs, falsy, or not an
                                        instance of the expected type.
        """
        for key, expected_type in self._REQUIRED_KEYWORDS.items():
            value = kwargs.get(key)
            # TODO(ayatane): `not value` includes both the cases where value is
            # None and where value is the correct type, but empty (e.g., empty
            # dict).  It looks like this is NOT the intended behavior, but I'm
            # hesitant to remove it in case something is actually relying on
            # this behavior.
            if not value or not isinstance(value, expected_type):
                raise error.SuiteArgumentException(
                        'reimage_and_run() needs %s=<%r>'
                        % (key, expected_type))

    def _init_predicate(self, predicate):
        """Initialize predicate attribute.

        @param predicate: caller-supplied predicate, or None to fall back to
                          Suite.name_in_tag_predicate() for this suite's name.
        """
        if predicate is None:
            self.predicate = Suite.name_in_tag_predicate(self.name)
        else:
            self.predicate = predicate

    def _init_suite_dependencies(self, suite_dependencies):
        """Initialize suite dependencies attribute.

        @param suite_dependencies: None, a comma separated string of labels,
                                   or an iterable of label strings.
        """
        if suite_dependencies is None:
            self.suite_dependencies = []
        elif isinstance(suite_dependencies, str):
            self.suite_dependencies = [dep.strip(' ') for dep
                                       in suite_dependencies.split(',')]
        else:
            # Copy: _add_builds_to_suite_deps() extends this list, and that
            # must not leak back into the list object owned by the caller.
            self.suite_dependencies = list(suite_dependencies)

    def _init_devserver(self, devserver_url):
        """Initialize devserver attribute.

        An AndroidBuildServer is used when the builds contain an Android
        build version entry; otherwise an ImageServer is used.

        @param devserver_url: url to the selected devserver.
        """
        if provision.ANDROID_BUILD_VERSION_PREFIX in self.builds:
            self.devserver = dev_server.AndroidBuildServer(devserver_url)
        else:
            self.devserver = dev_server.ImageServer(devserver_url)

    def _init_test_source_build(self, test_source_build):
        """Initialize test_source_build attribute.

        @param test_source_build: build holding the server-side test code, or
                                  a falsy value to let
                                  Suite.get_test_source_build() choose one
                                  from self.builds.
        """
        if test_source_build:
            test_source_build = self.devserver.translate(test_source_build)

        self.test_source_build = Suite.get_test_source_build(
                self.builds, test_source_build=test_source_build)

    def _translate_builds(self):
        """Translate build names if they are in LATEST format.

        Only entries keyed by one of _VERSION_PREFIXES are translated.  The
        entries of self.builds (the dict supplied by the caller) are
        overwritten in place.
        """
        for prefix in self._VERSION_PREFIXES:
            if prefix in self.builds:
                translated_build = self.devserver.translate(
                        self.builds[prefix])
                self.builds[prefix] = translated_build

    def _add_builds_to_suite_deps(self):
        """Add builds to suite_dependencies.

        To support provision both CrOS and firmware, option builds are added to
        _SuiteSpec, e.g.,

        builds = {'cros-version:': 'x86-alex-release/R18-1655.0.0',
                  'fwrw-version:': 'x86-alex-firmware/R36-5771.50.0'}

        version_prefix+build should make it into each test as a DEPENDENCY.
        The easiest way to do this is to tack it onto the suite_dependencies.
        """
        self.suite_dependencies.extend(
                provision.join(version_prefix, build)
                for version_prefix, build in self.builds.items()
        )


def _get_suite_job_id(job):
    """Work out the AFE job id of the currently running suite job.

    This is best effort: failure to determine the id is logged, not raised.

    @param job: the running suite job; its `tag` attribute is parsed.

    @return: the job id as an int, or None if it could not be determined.
    """
    try:
        job_id = int(tko_utils.get_afe_job_id(job.tag))
    except (TypeError, ValueError):
        # int() raises ValueError for a non-numeric string and TypeError for
        # None; in both cases the id is simply unknown.
        logging.warning('Could not determine own job id.')
        return None
    logging.debug('Determined own job id: %d', job_id)
    return job_id


def run_provision_suite(**dargs):
    """
    Run a provision suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds by scheduling dummy_Pass.

    @param dargs: Dictionary containing the arguments passed to _SuiteSpec().
                  In particular:
                  job: an instance of client.common_lib.base_job representing
                       the currently running suite job.

    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    spec = _SuiteSpec(**dargs)

    afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                        user=spec.job.user, debug=False)
    tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                        user=spec.job.user, debug=False)

    my_job_id = _get_suite_job_id(spec.job)

    suite = ProvisionSuite(
            tag=spec.name,
            builds=spec.builds,
            board=spec.board,
            devserver=spec.devserver,
            count=1,
            afe=afe,
            tko=tko,
            pool=spec.pool,
            results_dir=spec.job.resultdir,
            max_runtime_mins=spec.max_runtime_mins,
            timeout_mins=spec.timeout_mins,
            file_bugs=spec.file_bugs,
            suite_job_id=my_job_id,
            extra_deps=spec.suite_dependencies,
            priority=spec.priority,
            wait_for_results=spec.wait_for_results,
            job_retry=spec.job_retry,
            max_retries=spec.max_retries,
            offload_failures_only=spec.offload_failures_only,
            test_source_build=spec.test_source_build,
            run_prod_code=spec.run_prod_code,
            job_keyvals=spec.job_keyvals,
            test_args=spec.test_args)

    _run_suite_with_spec(suite, spec)

    logging.debug('Returning from dynamic_suite.run_provision_suite')


def reimage_and_run(**dargs):
    """
    Backward-compatible API for dynamic_suite.

    Will re-image a number of devices (of the specified board) with the
    provided builds, and then run the indicated test suite on them.
    Guaranteed to be compatible with any build from stable to dev.

    @param dargs: Dictionary containing the arguments passed to _SuiteSpec().
    @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                      from the devserver.
    @raises MalformedDependenciesException: if the dependency_info file for
                                            the required build fails to parse.
    """
    suite_spec = _SuiteSpec(**dargs)

    afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)
    tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                        user=suite_spec.job.user, debug=False)

    # _SuiteSpec has already validated 'job', so read it from the spec.
    my_job_id = _get_suite_job_id(suite_spec.job)

    _perform_reimage_and_run(suite_spec, afe, tko, suite_job_id=my_job_id)

    logging.debug('Returning from dynamic_suite.reimage_and_run.')


def _perform_reimage_and_run(spec, afe, tko, suite_job_id=None):
    """
    Do the work of reimaging hosts and running tests.

    Builds a Suite from the predicate held by the spec and runs it.

    @param spec: a populated _SuiteSpec object.
    @param afe: an instance of AFE as defined in server/frontend.py.
    @param tko: an instance of TKO as defined in server/frontend.py.
    @param suite_job_id: Job id that will act as parent id to all sub jobs.
                         Default: None
    """
    suite = Suite.create_from_predicates(
            predicates=[spec.predicate],
            name=spec.name,
            builds=spec.builds,
            board=spec.board,
            devserver=spec.devserver,
            afe=afe,
            tko=tko,
            pool=spec.pool,
            results_dir=spec.job.resultdir,
            max_runtime_mins=spec.max_runtime_mins,
            timeout_mins=spec.timeout_mins,
            file_bugs=spec.file_bugs,
            suite_job_id=suite_job_id,
            extra_deps=spec.suite_dependencies,
            priority=spec.priority,
            wait_for_results=spec.wait_for_results,
            job_retry=spec.job_retry,
            max_retries=spec.max_retries,
            offload_failures_only=spec.offload_failures_only,
            test_source_build=spec.test_source_build,
            run_prod_code=spec.run_prod_code,
            job_keyvals=spec.job_keyvals,
            test_args=spec.test_args)
    _run_suite_with_spec(suite, spec)


def _run_suite_with_spec(suite, spec):
    """
    Run the given suite using the settings held by a _SuiteSpec.

    @param suite: _BaseSuite instance to run.
    @param spec: a populated _SuiteSpec object.
    """
    _run_suite(
            suite=suite,
            job=spec.job,
            run_prod_code=spec.run_prod_code,
            devserver=spec.devserver,
            build=spec.test_source_build,
            delay_minutes=spec.delay_minutes,
            bug_template=spec.bug_template)


def _run_suite(
        suite,
        job,
        run_prod_code,
        devserver,
        build,
        delay_minutes,
        bug_template):
    """
    Run a suite.

    Stages artifacts (unless running prod code), records the time at which
    that finished as a job keyval, optionally sleeps, schedules the suite's
    tests and, if the suite asks for it, waits for their results.

    @param suite: _BaseSuite instance.
    @param job: an instance of client.common_lib.base_job representing the
                currently running suite job.
    @param run_prod_code: whether to use prod test code.
    @param devserver: devserver for staging artifacts.
    @param build: the build to install e.g. 'x86-alex-release/R18-1655.0.0'
    @param delay_minutes: Delay the creation of test jobs for a given number
                          of minutes.
    @param bug_template: A template dictionary specifying the default bug
                         filing options for failures in this suite.

    @raises AsynchronousBuildFailure: if staging artifacts on the devserver
                                      fails.
    """
    # We can't do anything else until the devserver has finished downloading
    # control_files and test_suites packages so that we can get the control
    # files we should schedule.
    if not run_prod_code:
        _stage_artifacts_for_build(devserver, build)

    timestamp = datetime.datetime.now().strftime(time_utils.TIME_FMT)
    utils.write_keyval(
            job.resultdir,
            {constants.ARTIFACT_FINISHED_TIME: timestamp})

    if delay_minutes:
        logging.debug('delay_minutes is set. Sleeping %d minutes before '
                      'creating test jobs.', delay_minutes)
        time.sleep(delay_minutes*60)
        logging.debug('Finished waiting for %d minutes before creating test '
                      'jobs.', delay_minutes)

    # Now we get to asynchronously schedule tests.
    suite.schedule(job.record_entry)

    if suite.wait_for_results:
        logging.debug('Waiting on suite.')
        reporter = suite.get_result_reporter(bug_template)
        suite.wait(job.record_entry, reporter=reporter)
        logging.debug('Finished waiting on suite. '
                      'Returning from _run_suite.')
    else:
        logging.info('wait_for_results is set to False, suite job will exit '
                     'without waiting for test jobs to finish.')


def _stage_artifacts_for_build(devserver, build):
    """Stage artifacts for a suite job.

    @param devserver: devserver to stage artifacts with.
    @param build: image to stage artifacts for.

    @raises AsynchronousBuildFailure: if the devserver reports a
                                      DevServerException while staging.
    """
    try:
        devserver.stage_artifacts(
                image=build,
                artifacts=['control_files', 'test_suites'])
    except dev_server.DevServerException as e:
        # If we can't get the control files, there's nothing to run.
        raise error.AsynchronousBuildFailure(e)