1#!/usr/bin/python 2# 3# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7 8"""Tool for running suites of tests and waiting for completion. 9 10The desired test suite will be scheduled with autotest. By default, 11this tool will block until the job is complete, printing a summary 12at the end. Error conditions result in exceptions. 13 14This is intended for use only with Chrome OS test suits that leverage the 15dynamic suite infrastructure in server/cros/dynamic_suite.py. 16 17This script exits with one of the following codes: 180 - OK: Suite finished successfully 191 - ERROR: Test(s) failed, or hits its own timeout 202 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out. 213 - INFRA_FAILURE: Infrastructure related issues, e.g. 22 * Lab is down 23 * Too many duts (defined as a constant) in repair failed status 24 * Suite job issues, like bug in dynamic suite, 25 user aborted the suite, lose a drone/all devservers/rpc server, 26 0 tests ran, etc. 27 * provision failed 28 TODO(fdeng): crbug.com/413918, reexamine treating all provision 29 failures as INFRA failures. 304 - SUITE_TIMEOUT: Suite timed out, some tests ran, 31 none failed by the time the suite job was aborted. This will cover, 32 but not limited to, the following cases: 33 * A devserver failure that manifests as a timeout 34 * No DUTs available midway through a suite 35 * Provision/Reset/Cleanup took longer time than expected for new image 36 * A regression in scheduler tick time. 375- BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool. 386- INVALID_OPTIONS: If options are not valid. 39""" 40 41import argparse 42import ast 43import collections 44from datetime import datetime 45from datetime import timedelta 46import functools 47import getpass 48import logging 49import os 50import re 51import sys 52import time 53import warnings 54 55import common 56from chromite.lib import buildbot_annotations as annotations 57from chromite.lib import gs 58from chromite.lib import osutils 59 60from django.core import exceptions as django_exceptions 61 62try: 63 from suite_scheduler import config_reader 64 from suite_scheduler import skylab 65except ImportError: 66 # For unittest 67 config_reader = None 68 skylab = None 69 70from autotest_lib.client.common_lib import control_data 71from autotest_lib.client.common_lib import error 72from autotest_lib.client.common_lib import global_config 73from autotest_lib.client.common_lib import priorities 74from autotest_lib.client.common_lib import time_utils 75from autotest_lib.client.common_lib.cros import retry 76from autotest_lib.frontend.afe import rpc_client_lib 77from autotest_lib.frontend.afe.json_rpc import proxy 78from autotest_lib.server import site_utils 79from autotest_lib.server import utils 80from autotest_lib.server.cros.dynamic_suite import constants 81from autotest_lib.server.cros.dynamic_suite import frontend_wrappers 82from autotest_lib.server.cros.dynamic_suite import reporting_utils 83from autotest_lib.server.cros.dynamic_suite import suite_common 84from autotest_lib.server.cros.dynamic_suite import tools 85try: 86 from autotest_lib.site_utils import diagnosis_utils 87except django_exceptions.ImproperlyConfigured as e: 88 if 'Error loading MySQLdb module: libmariadbclient' in str(e): 89 logging.error('Unable to import a necessary MySQLdb module. This is ' 90 'commonly caused by running a command inside[outside] ' 91 'of the chroot but having autotest utility packages ' 92 'that were build outside[inside] the chroot. ' 93 'Please re-run utils/build_externals.py inside[outside] ' 94 'of the chroot accordingly.') 95 raise 96from autotest_lib.site_utils import run_suite_common 97 98CONFIG = global_config.global_config 99 100_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value( 101 'SERVER', 'hostname', type=str) 102_URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str) 103_ENABLE_RUN_SUITE_TRAMPOLINE = CONFIG.get_config_value( 104 'CROS', 'enable_run_suite_trampoline', type=bool, default=False) 105 106_MIGRATION_CONFIG_FILE = 'migration_config.ini' 107_MIGRATION_CONFIG_BUCKET = 'suite-scheduler.google.com.a.appspot.com' 108_TRAMPOLINE_CONFIG = 'gs://%s/%s' % (_MIGRATION_CONFIG_BUCKET, 109 _MIGRATION_CONFIG_FILE) 110 111# Minimum RPC timeout setting for calls expected to take long time, e.g., 112# create_suite_job. If default socket time (socket.getdefaulttimeout()) is 113# None or greater than this value, the default will be used. 114# The value here is set to be the same as the timeout for the RetryingAFE object 115# so long running RPCs can wait long enough before being aborted. 116_MIN_RPC_TIMEOUT = 600 117 118# Number of days back to search for existing job. 119_SEARCH_JOB_MAX_DAYS = 14 120 121_PROVISION_SUITE = 'provision' 122 123 124@functools.total_ordering 125class _ReturnResult(object): 126 """Represents overall result of run_suite operation. 127 128 _ReturnResult instances sort based on priority (the order in 129 _RETURN_RESULTS). 130 131 Furthermore, _ReturnResult instances can be combined by bitwise or 132 ("union"), which returns the instance with the higher priority 133 between the two (the instance with higher priority is a "superset" 134 of the other). 135 136 Do not create new instances of this; use _RETURN_RESULTS instead. 137 """ 138 139 def __init__(self, return_code, message): 140 self.return_code = return_code 141 self.message = message 142 143 def __repr__(self): 144 return '<{cls} {key}, {this.return_code}, {this.message}>'.format( 145 cls=type(self).__name__, 146 key=self._getkey(), 147 this=self) 148 149 def __gt__(self, other): 150 if isinstance(other, type(self)): 151 return self._getkey() > other._getkey() 152 else: 153 return NotImplemented 154 155 def __eq__(self, other): 156 if isinstance(other, type(self)): 157 return (self.return_code == other.return_code 158 and self.message == other.message) 159 else: 160 return NotImplemented 161 162 def __hash__(self): 163 return hash(self.return_code) ^ hash(self.message) 164 165 def __or__(self, other): 166 if isinstance(other, type(self)): 167 if self > other: 168 return self 169 else: 170 return other 171 else: 172 return NotImplemented 173 174 def _getkey(self): 175 """Return sort key.""" 176 return _RETURN_RESULTS_LIST.index(self) 177 178 def suite_result(self, output_dict=None): 179 """Make a SuiteResult using this _ReturnResult. 180 181 @param output_dict: output_dict to merge into SuiteResult. 182 """ 183 if output_dict is None: 184 output_dict = dict() 185 else: 186 output_dict = output_dict.copy() 187 if self.message: 188 output_dict['return_message'] = self.message 189 return run_suite_common.SuiteResult(self.return_code, output_dict) 190 191 192_RETURN_RESULTS = collections.OrderedDict([ 193 ('ok', _ReturnResult(run_suite_common.RETURN_CODES.OK, '')), 194 195 ('test_warning', _ReturnResult( 196 run_suite_common.RETURN_CODES.WARNING, 'Test job raised warning.')), 197 ('suite_warning', _ReturnResult( 198 run_suite_common.RETURN_CODES.WARNING, 'Suite job raised warning.')), 199 ('test_retry', _ReturnResult( 200 run_suite_common.RETURN_CODES.WARNING, 'Tests were retried.')), 201 202 ('test_aborted_prestart', _ReturnResult( 203 run_suite_common.RETURN_CODES.SUITE_TIMEOUT, 204 'Tests were aborted before running; suite must have timed out.')), 205 # This really indicates a user action or an infra failure. But, suite 206 # timeouts cause similar fauilres in the individual tests, so we must 207 # classify these lower than suite_timeout. In case of a suite_timeout, the 208 # result from the suite job will promote the result to suite_timeout. 209 ('test_aborted_mystery', 210 _ReturnResult( 211 run_suite_common.RETURN_CODES.SUITE_TIMEOUT, 212 'Tests were aborted after running, but before timeout; ' 213 'Test was manually aborted or parsing results failed: ' 214 'crbug.com/796348.')), 215 ('suite_timeout', _ReturnResult( 216 run_suite_common.RETURN_CODES.SUITE_TIMEOUT, 'Suite job timed out.')), 217 218 ('test_views_missing', _ReturnResult( 219 run_suite_common.RETURN_CODES.INFRA_FAILURE, 'No test views found.')), 220 ('suite_failed', _ReturnResult( 221 run_suite_common.RETURN_CODES.INFRA_FAILURE, 'Suite job failed.')), 222 ('provision_failed', _ReturnResult( 223 run_suite_common.RETURN_CODES.INFRA_FAILURE, 'Provisioning failed.')), 224 225 ('test_failure', _ReturnResult( 226 run_suite_common.RETURN_CODES.ERROR, 'Tests failed.')), 227]) 228_RETURN_RESULTS_LIST = list(_RETURN_RESULTS.values()) 229 230 231def bool_str(x): 232 """Boolean string type for option arguments. 233 234 @param x: string representation of boolean value. 235 236 """ 237 if x == 'True': 238 return True 239 elif x == 'False': 240 return False 241 else: 242 raise argparse.ArgumentTypeError( 243 '%s is not one of True or False' % (x,)) 244 245 246def _get_priority_value(x): 247 """Convert a priority representation to its int value. 248 249 Priorities can be described either by an int value (possibly as a string) 250 or a name string. This function coerces both forms to an int value. 251 252 This function is intended for casting command line arguments during 253 parsing. 254 255 @param x: priority value as an int, int string, or name string 256 257 @returns: int value of priority 258 """ 259 try: 260 return int(x) 261 except ValueError: 262 try: 263 return priorities.Priority.get_value(x) 264 except AttributeError: 265 raise argparse.ArgumentTypeError( 266 'Unknown priority level %s. Try one of %s.' 267 % (x, ', '.join(priorities.Priority.names))) 268 269 270def make_parser(): 271 """Make ArgumentParser instance for run_suite.py.""" 272 parser = argparse.ArgumentParser( 273 usage="%(prog)s [options]") 274 parser.add_argument("-b", "--board", dest="board") 275 parser.add_argument( 276 "--model", 277 help="The device model to run tests against. For non-unified " 278 "builds, model and board are synonymous, but board is more " 279 "accurate in some cases. Only pass this option if your build " 280 "is a unified build.", 281 ) 282 parser.add_argument("-i", "--build", dest="build") 283 parser.add_argument( 284 "-w", "--web", dest="web", default=None, 285 help="Address of a webserver to receive suite requests.") 286 parser.add_argument( 287 '--cheets_build', dest='cheets_build', default=None, 288 help='ChromeOS Android build to be installed on dut.') 289 parser.add_argument( 290 '--firmware_rw_build', dest='firmware_rw_build', default=None, 291 help='Firmware build to be installed in dut RW firmware.') 292 parser.add_argument( 293 '--firmware_ro_build', dest='firmware_ro_build', default=None, 294 help='Firmware build to be installed in dut RO firmware.') 295 parser.add_argument( 296 '--test_source_build', dest='test_source_build', default=None, 297 help=('Build that contains the test code, ' 298 'e.g., it can be the value of `--build`, ' 299 '`--firmware_rw_build` or `--firmware_ro_build` ' 300 'arguments. Default is None, that is, use the test ' 301 'code from `--build` (CrOS image)')) 302 # This should just be a boolean flag, but the autotest "proxy" code 303 # can't handle flags that don't take arguments. 304 parser.add_argument( 305 "-n", "--no_wait", dest="no_wait", default=False, type=bool_str, 306 help='Must pass "True" or "False" if used.') 307 # If you really want no pool, --pool="" will do it. USE WITH CARE. 308 parser.add_argument("-p", "--pool", dest="pool", default="suites") 309 parser.add_argument("-s", "--suite_name", dest="name") 310 parser.add_argument("-a", "--afe_timeout_mins", type=int, 311 dest="afe_timeout_mins", default=30) 312 parser.add_argument("-t", "--timeout_mins", type=int, 313 dest="timeout_mins", default=1440) 314 parser.add_argument("-x", "--max_runtime_mins", type=int, 315 dest="max_runtime_mins", default=1440) 316 parser.add_argument("-d", "--delay_sec", type=int, 317 dest="delay_sec", default=10) 318 parser.add_argument("-m", "--mock_job_id", dest="mock_job_id", 319 help="Attach to existing job id for already running " 320 "suite, and creates report.") 321 # NOTE(akeshet): This looks similar to --no_wait, but behaves differently. 322 # --no_wait is passed in to the suite rpc itself and affects the suite, 323 # while this does not. 324 parser.add_argument("-c", "--create_and_return", dest="create_and_return", 325 action="store_true", 326 help="Create the suite and print the job id, then " 327 "finish immediately.") 328 parser.add_argument("-u", "--num", dest="num", type=int, default=None, 329 help="Deprecated, does nothing.") 330 # Same boolean flag issue applies here. 331 parser.add_argument( 332 "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str, 333 help=('File bugs on test failures. Must pass "True" or ' 334 '"False" if used.')) 335 parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus", 336 action="store_true", help='Bypass lab status check.') 337 # We allow either a number or a string for the priority. This way, if you 338 # know what you're doing, one can specify a custom priority level between 339 # other levels. 340 parser.add_argument("-r", "--priority", dest="priority", 341 type=_get_priority_value, 342 default=priorities.Priority.DEFAULT, 343 action="store", 344 help="Priority of suite. Either numerical value, or " 345 "one of (" + ", ".join(priorities.Priority.names) 346 + ").") 347 parser.add_argument( 348 '--retry', dest='retry', default=False, type=bool_str, action='store', 349 help='Enable test retry. Must pass "True" or "False" if used.') 350 parser.add_argument('--max_retries', dest='max_retries', default=None, 351 type=int, action='store', help='Maximum retries' 352 'allowed at suite level. No limit if not specified.') 353 parser.add_argument('--minimum_duts', dest='minimum_duts', type=int, 354 default=0, action='store', 355 help='Check that the pool has at least such many ' 356 'healthy machines, otherwise suite will not run. ' 357 'Default to 0.') 358 parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int, 359 default=0, action='store', 360 help='Preferred minimum number of machines. Scheduler ' 361 'will prioritize on getting such many machines for ' 362 'the suite when it is competing with another suite ' 363 'that has a higher priority but already got minimum ' 364 'machines it needs. Default to 0.') 365 parser.add_argument("--suite_args", dest="suite_args", 366 type=ast.literal_eval, 367 default=None, action="store", 368 help="A dict of args passed to the suite control file.") 369 parser.add_argument('--offload_failures_only', 370 dest='offload_failures_only', type=bool_str, 371 action='store', default=False, 372 help='Only enable gs_offloading for failed tests. ' 373 'Successful tests will be deleted. Must pass "True"' 374 ' or "False" if used.') 375 parser.add_argument('--use_suite_attr', dest='use_suite_attr', 376 action='store_true', default=False, 377 help='Advanced. Run the suite based on ATTRIBUTES of ' 378 'control files, rather than SUITE.') 379 parser.add_argument('--json_dump', dest='json_dump', action='store_true', 380 default=False, 381 help='Dump the output of run_suite to stdout.') 382 parser.add_argument( 383 '--run_prod_code', dest='run_prod_code', 384 action='store_true', default=False, 385 help='Run the test code that lives in prod aka the test ' 386 'code currently on the lab servers.') 387 parser.add_argument( 388 '--delay_minutes', type=int, default=0, 389 help=('Delay the creation of test jobs for a given ' 390 'number of minutes. This argument can be used to ' 391 'force provision jobs being delayed, which helps ' 392 'to distribute loads across devservers.')) 393 parser.add_argument( 394 '--skip_duts_check', dest='skip_duts_check', action='store_true', 395 default=False, help='If True, skip minimum available DUTs check') 396 parser.add_argument( 397 '--job_keyvals', dest='job_keyvals', type=ast.literal_eval, 398 action='store', default=None, 399 help='A dict of job keyvals to be inject to suite control file') 400 parser.add_argument( 401 '--test_args', dest='test_args', type=ast.literal_eval, 402 action='store', default=None, 403 help=('A dict of args passed all the way to each individual test that ' 404 'will be actually ran.')) 405 parser.add_argument( 406 '--require_logfile', action='store_true', 407 help=('Stream logs of run_suite.py to a local file named ' 408 'run_suite-<build name>.log.')) 409 410 # Used for monitoring purposes, to measure no-op swarming proxy latency. 411 parser.add_argument('--do_nothing', action='store_true', 412 help=argparse.SUPPRESS) 413 414 # Used when lab/job status checking is needed. Currently its only user is 415 # suite scheduler v2. 416 parser.add_argument( 417 '--pre_check', action='store_true', 418 help=('Check lab and job status before kicking off a suite. Used by ' 419 'suite scheduler v2.')) 420 421 # TODO(crbug.com/763207): This is to support calling old moblab RPC 422 # with ToT code. This does not need to be supported after M62. 423 parser.add_argument('--oldrpc', action='store_true', 424 help='Use old AFE RPC.') 425 426 return parser 427 428 429def verify_and_clean_options(options): 430 """Verify the validity of options. 431 432 @param options: The parsed options to verify. 433 434 @returns: True if verification passes, False otherwise. 435 436 """ 437 if options.mock_job_id and ( 438 not options.build or not options.name or not options.board): 439 print ('When using -m, need to specify build, board and suite ' 440 'name which you have used for creating the original job') 441 return False 442 else: 443 if not options.build: 444 print 'Need to specify which build to use' 445 return False 446 if not options.board: 447 print 'Need to specify board' 448 return False 449 if not options.name: 450 print 'Need to specify suite name' 451 return False 452 if options.num is not None: 453 warnings.warn('-u/--num option is deprecated; it does nothing.') 454 del options.num 455 if not options.retry and options.max_retries is not None: 456 print 'max_retries can only be used with --retry=True' 457 return False 458 if options.use_suite_attr and options.suite_args is not None: 459 print ('The new suite control file cannot parse the suite_args: %s.' 460 'Please not specify any suite_args here.' % options.suite_args) 461 return False 462 if options.no_wait and options.retry: 463 print 'Test retry is not available when using --no_wait=True' 464 # Default to use the test code in CrOS build. 465 if not options.test_source_build and options.build: 466 options.test_source_build = options.build 467 options.child_dependencies = _make_child_dependencies(options) 468 base_dependencies = ('board:%s' % options.board, 469 'pool:%s' % options.pool) 470 options.dependencies = base_dependencies + options.child_dependencies 471 return True 472 473 474def change_options_for_suite_attr(options): 475 """Change options to be prepared to run the suite_attr_wrapper. 476 477 If specify 'use_suite_attr' from the cmd line, it indicates to run the 478 new style suite control file, suite_attr_wrapper. Then, change the 479 options.name to 'suite_attr_wrapper', change the options.suite_args to 480 include the arguments needed by suite_attr_wrapper. 481 482 @param options: The verified options. 483 484 @returns: The changed options. 485 486 """ 487 # Convert the suite_name to attribute boolean expression. 488 if type(options.name) is str: 489 attr_filter_val = 'suite:%s' % options.name 490 else: 491 attr_filter_val = ' or '.join(['suite:%s' % x for x in options.name]) 492 493 # change the suite_args to be a dict of arguments for suite_attr_wrapper 494 # if suite_args is not None, store the values in 'other_args' of the dict 495 args_dict = {} 496 args_dict['attr_filter'] = attr_filter_val 497 options.suite_args = args_dict 498 options.name = 'suite_attr_wrapper' 499 500 return options 501 502 503class TestResult(object): 504 505 """Represents the result of a TestView.""" 506 507 def __init__(self, test_view, retry_count=0): 508 """Initialize instance. 509 510 @param test_view: TestView instance. 511 @param retry_count: Retry count for test. Optional. 512 """ 513 self.name = test_view.get_testname() 514 self.status = test_view['status'] 515 self.reason = test_view['reason'] 516 self.retry_count = retry_count 517 518 _PRETTY_STATUS_MAP = { 519 'GOOD': '[ PASSED ]', 520 'TEST_NA': '[ INFO ]', 521 } 522 523 @property 524 def _pretty_status(self): 525 """Pretty status string.""" 526 return self._PRETTY_STATUS_MAP.get(self.status, '[ FAILED ]') 527 528 def log_using(self, log_function, name_column_width): 529 """Log the test result using the given log function. 530 531 @param log_function: Log function to use. Example: logging.info 532 @param name_column_width: Width of name column for formatting. 533 """ 534 padded_name = self.name.ljust(name_column_width) 535 log_function('%s%s', padded_name, self._pretty_status) 536 if self.status != 'GOOD': 537 log_function('%s %s: %s', padded_name, self.status, self.reason) 538 if self.retry_count > 0: 539 log_function('%s retry_count: %s', padded_name, self.retry_count) 540 541 542def get_original_suite_name(suite_name, suite_args): 543 """Get the original suite name when running suite_attr_wrapper. 544 545 @param suite_name: the name of the suite launched in afe. When it is 546 suite_attr_wrapper, the suite that actually running is 547 specified in the suite_args. 548 @param suite_args: dict of suite args from argument parsing. 549 550 @returns: the original suite name. 551 552 """ 553 if suite_name == 'suite_attr_wrapper': 554 attrs = suite_args.get('attr_filter', '') 555 suite_list = ([x[6:] for x in re.split('[() ]', attrs) 556 if x and x.startswith('suite:')]) 557 return suite_list[0] if suite_list else suite_name 558 return suite_name 559 560 561class LogLink(object): 562 """Information needed to record a link in the logs. 563 564 Depending on context and the information provided at 565 construction time, the link may point to either to log files for 566 a job, or to a bug filed for a failure in the job. 567 568 @var anchor The link text. 569 @var url The link url. 570 @var bug_id Id of a bug to link to, or None. 571 """ 572 573 # A list of tests that don't get retried so skip the dashboard. 574 _SKIP_RETRY_DASHBOARD = ['provision'] 575 576 _BUG_LINK_PREFIX = 'Auto-Bug' 577 _LOG_LINK_PREFIX = 'Test-Logs' 578 579 580 def __init__(self, anchor, server, job_string, bug_info=None, reason=None, 581 retry_count=0, testname=None, sponge_url=None): 582 """Initialize the LogLink by generating the log URL. 583 584 @param anchor The link text. 585 @param server The hostname of the server this suite ran on. 586 @param job_string The job whose logs we'd like to link to. 587 @param bug_info Info about the bug, if one was filed. 588 @param reason A string representing the reason of failure if any. 589 @param retry_count How many times the test has been retried. 590 @param testname Optional Arg that supplies the testname. 591 @param sponge_url url to Sponge result. 592 """ 593 self.anchor = anchor 594 self.url = _URL_PATTERN % (rpc_client_lib.add_protocol(server), 595 job_string) 596 self.reason = reason 597 self.retry_count = retry_count 598 self.testname = testname 599 self.sponge_url = sponge_url 600 if bug_info: 601 self.bug_id, self.bug_count = bug_info 602 else: 603 self.bug_id = None 604 self.bug_count = None 605 606 607 @property 608 def bug_url(self): 609 """URL of associated bug.""" 610 if self.bug_id: 611 return reporting_utils.link_crbug(self.bug_id) 612 else: 613 return None 614 615 616 @property 617 def _bug_count_text(self): 618 """Return bug count as human friendly text.""" 619 if self.bug_count is None: 620 bug_info = 'unknown number of reports' 621 elif self.bug_count == 1: 622 bug_info = 'new report' 623 else: 624 bug_info = '%s reports' % self.bug_count 625 return bug_info 626 627 628 def GenerateBuildbotLinks(self): 629 """Generate a link formatted to meet buildbot expectations. 630 631 If there is a bug associated with this link, report a link to the bug 632 and a link to the job logs; otherwise report a link to the job logs. 633 634 @return A generator of links formatted for the buildbot log annotator. 635 """ 636 if self.bug_url: 637 yield self._get_link_to_bug() 638 yield self._get_link_to_job_logs() 639 640 641 def _get_link_to_bug(self): 642 """Return buildbot link to bug. 643 644 @return A link formatted for the buildbot log annotator. 645 """ 646 info_strings = self._get_info_strings() 647 info_strings.append(self._bug_count_text) 648 anchor_text = self._format_anchor_text(self._BUG_LINK_PREFIX, 649 info_strings) 650 return annotations.StepLink(anchor_text, self.bug_url) 651 652 653 def _get_link_to_job_logs(self): 654 """Return buildbot link to job logs. 655 656 @return A link formatted for the buildbot log annotator. 657 """ 658 anchor_text = self._format_anchor_text(self._LOG_LINK_PREFIX, 659 self._get_info_strings()) 660 return annotations.StepLink(anchor_text, self.url) 661 662 663 def _get_info_strings(self): 664 """Return a list of info strings for _format_anchor_text().""" 665 info_strings = [] 666 if self.retry_count > 0: 667 info_strings.append('retry_count: %d' % self.retry_count) 668 if self.reason: 669 info_strings.append(self.reason) 670 return info_strings 671 672 673 def _format_anchor_text(self, prefix, info_strings): 674 """Format anchor text given a prefix and info strings. 675 676 @param prefix The prefix of the anchor text. 677 @param info_strings Iterable of strings. 678 @return A anchor_text with the right prefix and info strings. 679 """ 680 return '[{prefix}]: {anchor}: {info}'.format( 681 prefix=prefix, 682 anchor=self.anchor.strip(), 683 info=', '.join(info_strings)) 684 685 @property 686 def text_link(self): 687 """Link to the job's logs, for consumption by a human. 688 689 @return A link formatted for human readability. 690 """ 691 return '%s %s' % (self.anchor, self.url) 692 693 def GenerateRetryLink(self): 694 """Generate a link to the retry dashboard. 695 696 @return A link formatted for the buildbot log annotator. 697 """ 698 if not self.testname or self.testname in self._SKIP_RETRY_DASHBOARD: 699 return None 700 701 # TODO(xixuan): Return the right flake dashboard later. 702 return None 703 704 def GenerateHistoryLink(self): 705 """Generate a link to the test history dashboard. 706 707 @return A link formatted for the buildbot log annotator. 708 """ 709 if not self.testname or self.testname in self._SKIP_RETRY_DASHBOARD: 710 return None 711 return annotations.StepLink( 712 text='[Test-History]: %s' % self.testname, 713 url=reporting_utils.link_test_history(self.testname)) 714 715 716class Timings(object): 717 """Timings for important events during a suite. 718 719 All timestamps are datetime.datetime objects. 720 721 @var suite_job_id: the afe job id of the suite job for which 722 we are recording the timing for. 723 @var download_start_time: the time the devserver starts staging 724 the build artifacts. Recorded in create_suite_job. 725 @var payload_end_time: the time when the artifacts only necessary to start 726 installsing images onto DUT's are staged. 727 Recorded in create_suite_job. 728 @var artifact_end_time: the remaining artifacts are downloaded after we kick 729 off the reimaging job, at which point we record 730 artifact_end_time. Recorded in dynamic_suite.py. 731 @var suite_start_time: the time the suite started. 732 @var tests_start_time: the time the first test started running. 733 @var tests_end_time: the time the last test finished running. 734 """ 735 736 def __init__(self, suite_job_id): 737 self.suite_job_id = suite_job_id 738 # Timings related to staging artifacts on devserver. 739 self.download_start_time = None 740 self.payload_end_time = None 741 self.artifact_end_time = None 742 743 # The test_start_time, but taken off the view that corresponds to the 744 # suite instead of an individual test. 745 self.suite_start_time = None 746 747 # Earliest and Latest tests in the set of TestViews passed to us. 748 self.tests_start_time = None 749 self.tests_end_time = None 750 751 752 def RecordTiming(self, view): 753 """Given a test report view, extract and record pertinent time info. 754 755 get_detailed_test_views() returns a list of entries that provide 756 info about the various parts of a suite run. This method can take 757 any one of these entries and look up timestamp info we might want 758 and record it. 759 760 If timestamps are unavailable, datetime.datetime.min/max will be used. 761 762 @param view: A TestView object. 763 """ 764 start_candidate = datetime.min 765 end_candidate = datetime.max 766 if view['test_started_time']: 767 start_candidate = time_utils.time_string_to_datetime( 768 view['test_started_time']) 769 if view['test_finished_time']: 770 end_candidate = time_utils.time_string_to_datetime( 771 view['test_finished_time']) 772 773 if view.get_testname() == TestView.SUITE_JOB: 774 self.suite_start_time = start_candidate 775 else: 776 self._UpdateFirstTestStartTime(start_candidate) 777 self._UpdateLastTestEndTime(end_candidate) 778 if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view: 779 keyvals = view['job_keyvals'] 780 self.download_start_time = time_utils.time_string_to_datetime( 781 keyvals.get(constants.DOWNLOAD_STARTED_TIME), 782 handle_type_error=True) 783 784 self.payload_end_time = time_utils.time_string_to_datetime( 785 keyvals.get(constants.PAYLOAD_FINISHED_TIME), 786 handle_type_error=True) 787 788 self.artifact_end_time = time_utils.time_string_to_datetime( 789 keyvals.get(constants.ARTIFACT_FINISHED_TIME), 790 handle_type_error=True) 791 792 793 def _UpdateFirstTestStartTime(self, candidate): 794 """Update self.tests_start_time, iff candidate is an earlier time. 795 796 @param candidate: a datetime.datetime object. 797 """ 798 if not self.tests_start_time or candidate < self.tests_start_time: 799 self.tests_start_time = candidate 800 801 802 def _UpdateLastTestEndTime(self, candidate): 803 """Update self.tests_end_time, iff candidate is a later time. 804 805 @param candidate: a datetime.datetime object. 806 """ 807 if not self.tests_end_time or candidate > self.tests_end_time: 808 self.tests_end_time = candidate 809 810 811 def __str__(self): 812 return ('\n' 813 'Suite timings:\n' 814 'Downloads started at %s\n' 815 'Payload downloads ended at %s\n' 816 'Suite started at %s\n' 817 'Artifact downloads ended (at latest) at %s\n' 818 'Testing started at %s\n' 819 'Testing ended at %s\n' % (self.download_start_time, 820 self.payload_end_time, 821 self.suite_start_time, 822 self.artifact_end_time, 823 self.tests_start_time, 824 self.tests_end_time)) 825 826 827def instance_for_pool(pool_name): 828 """ 829 Return the hostname of the server that should be used to service a suite 830 for the specified pool. 831 832 @param pool_name: The pool (without 'pool:' to schedule the suite against. 833 @return: The correct host that should be used to service this suite run. 834 """ 835 return CONFIG.get_config_value( 836 'POOL_INSTANCE_SHARDING', pool_name, 837 default=_DEFAULT_AUTOTEST_INSTANCE) 838 839 840class TestView(object): 841 """Represents a test view and provides a set of helper functions.""" 842 843 844 SUITE_JOB = 'Suite job' 845 846 847 def __init__(self, view, afe_job, suite_name, build, user, 848 solo_test_run=False): 849 """Init a TestView object representing a tko test view. 850 851 @param view: A dictionary representing a tko test view. 852 @param afe_job: An instance of frontend.afe.models.Job 853 representing the job that kicked off the test. 854 @param suite_name: The name of the suite 855 that the test belongs to. 856 @param build: The build for which the test is run. 857 @param user: The user for which the test is run. 858 @param solo_test_run: This is a solo test run not part of a suite. 859 """ 860 self.view = view 861 self.afe_job = afe_job 862 self.suite_name = suite_name 863 self.build = build 864 self.is_suite_view = afe_job.parent_job is None and not solo_test_run 865 # This is the test name that will be shown in the output. 866 self.testname = None 867 self.user = user 868 869 # The case that a job was aborted before it got a chance to run 870 # usually indicates suite has timed out (unless aborted by user). 871 # In this case, the abort reason will be None. 872 # Update the reason with proper information. 873 if (self.is_relevant_suite_view() and 874 not self.get_testname() == self.SUITE_JOB and 875 self.view['status'] == 'ABORT' and 876 not self.view['reason']): 877 self.view['reason'] = 'Timed out, did not run.' 878 879 880 def __getitem__(self, key): 881 """Overload __getitem__ so that we can still use [] 882 883 @param key: A key of the tko test view. 884 885 @returns: The value of an attribute in the view. 886 887 """ 888 return self.view[key] 889 890 891 def __iter__(self): 892 """Overload __iter__ so that it supports 'in' operator.""" 893 return iter(self.view) 894 895 896 def get_testname(self): 897 """Get test name that should be shown in the output. 898 899 Formalize the test_name we got from the test view. 900 901 Remove 'build/suite' prefix if any. 902 903 If one runs a test in control file via the following code, 904 job.runtest('my_Test', tag='tag') 905 for most of the cases, view['test_name'] would look like 'my_Test.tag'. 906 If this is the case, this method will just return the original 907 test name, i.e. 'my_Test.tag'. 908 909 There are four special cases. 910 1) A test view is for the suite job's SERVER_JOB. 911 In this case, this method will return 'Suite job'. 912 913 2) A test view is of a child job or a solo test run not part of a 914 suite, and for a SERVER_JOB or CLIENT_JOB. 915 In this case, we will take the job name, remove the build/suite 916 prefix from the job name, and append the rest to 'SERVER_JOB' 917 or 'CLIENT_JOB' as a prefix. So the names returned by this 918 method will look like: 919 'dummy_Pass_SERVER_JOB' 920 'dummy_Fail_SERVER_JOB' 921 922 3) A test view is of a suite job and its status is ABORT. 923 In this case, the view['test_name'] is the child job's name. 924 For instance, 925 'lumpy-release/R35-5712.0.0/dummy/dummy_Pass' 926 'lumpy-release/R35-5712.0.0/dummy/dummy_Fail' 927 The above names will be converted to the following: 928 'dummy_Pass' 929 'dummy_Fail' 930 931 4) A test view's status is of a suite job and its status is TEST_NA. 932 In this case, the view['test_name'] is the NAME field of the control 933 file. For instance, 934 'dummy_Pass' 935 'dummy_Fail' 936 This method will not modify these names. 937 938 @returns: Test name after normalization. 939 940 """ 941 if self.testname is not None: 942 return self.testname 943 944 if (self.is_suite_view and 945 self.view['test_name'].startswith('SERVER_JOB')): 946 # Rename suite job's SERVER_JOB to 'Suite job'. 947 self.testname = self.SUITE_JOB 948 return self.testname 949 950 if (self.view['test_name'].startswith('SERVER_JOB') or 951 self.view['test_name'].startswith('CLIENT_JOB')): 952 # Append job name as a prefix for SERVER_JOB and CLIENT_JOB 953 testname= '%s_%s' % (self.view['job_name'], self.view['test_name']) 954 else: 955 testname = self.view['test_name'] 956 # Remove the build and suite name from testname if any. 957 self.testname = tools.get_test_name( 958 self.build, self.suite_name, testname) 959 return self.testname 960 961 962 def is_relevant_suite_view(self): 963 """Checks whether this is a suite view we should care about. 964 965 @returns: True if it is relevant. False otherwise. 966 """ 967 return (self.get_testname() == self.SUITE_JOB or 968 (self.is_suite_view and 969 not self.view['test_name'].startswith('CLIENT_JOB') and 970 not self.view['subdir'])) 971 972 973 def is_test(self): 974 """Return whether the view is for an actual test. 975 976 @returns True if the view is for an actual test. 977 False if the view is for SERVER_JOB or CLIENT_JOB. 978 979 """ 980 return not (self.view['test_name'].startswith('SERVER_JOB') or 981 self.view['test_name'].startswith('CLIENT_JOB')) 982 983 984 def is_retry(self): 985 """Check whether the view is for a retry. 986 987 @returns: True, if the view is for a retry; False otherwise. 988 989 """ 990 return self.view['job_keyvals'].get('retry_original_job_id') is not None 991 992 993 def hit_timeout(self): 994 """Check whether the corresponding job has hit its own timeout. 995 996 Note this method should not be called for those test views 997 that belongs to a suite job and are determined as irrelevant 998 by is_relevant_suite_view. This is because they are associated 999 to the suite job, whose job start/finished time make no sense 1000 to an irrelevant test view. 1001 1002 @returns: True if the corresponding afe job has hit timeout. 1003 False otherwise. 1004 """ 1005 if (self.is_relevant_suite_view() and 1006 self.get_testname() != self.SUITE_JOB): 1007 # Any relevant suite test view except SUITE_JOB 1008 # did not hit its own timeout because it was not ever run. 1009 return False 1010 start = (datetime.strptime( 1011 self.view['job_started_time'], time_utils.TIME_FMT) 1012 if self.view['job_started_time'] else None) 1013 end = (datetime.strptime( 1014 self.view['job_finished_time'], time_utils.TIME_FMT) 1015 if self.view['job_finished_time'] else None) 1016 if not start or not end: 1017 return False 1018 else: 1019 return ((end - start).total_seconds()/60.0 1020 > self.afe_job.max_runtime_mins) 1021 1022 1023 def is_aborted(self): 1024 """Check if the view was aborted. 1025 1026 For suite job and child job test views, we check job keyval 1027 'aborted_by' and test status. 1028 1029 For relevant suite job test views, we only check test status 1030 because the suite job keyval won't make sense to individual 1031 test views. 1032 1033 @returns: True if the test was as aborted, False otherwise. 1034 1035 """ 1036 1037 if (self.is_relevant_suite_view() and 1038 self.get_testname() != self.SUITE_JOB): 1039 return self.view['status'] == 'ABORT' 1040 else: 1041 return (bool(self.view['job_keyvals'].get('aborted_by')) and 1042 self.view['status'] in ['ABORT', 'RUNNING']) 1043 1044 1045 def is_in_fail_status(self): 1046 """Check if the given test's status corresponds to a failure. 1047 1048 @returns: True if the test's status is FAIL or ERROR. False otherwise. 1049 1050 """ 1051 # All the statuses tests can have when they fail. 1052 return self.view['status'] in ['FAIL', 'ERROR', 'ABORT'] 1053 1054 1055 def is_provision(self): 1056 """Check whether this is a provision test.""" 1057 return self.get_testname() == 'provision' 1058 1059 1060 def get_buildbot_link_reason(self): 1061 """Generate the buildbot link reason for the test. 1062 1063 @returns: A string representing the reason. 1064 1065 """ 1066 return ('%s: %s' % (self.view['status'], self.view['reason']) 1067 if self.view['reason'] else self.view['status']) 1068 1069 1070 def get_job_id_owner_str(self): 1071 """Generate the job_id_owner string for a test. 1072 1073 @returns: A string which looks like 135036-username 1074 1075 """ 1076 return '%s-%s' % (self.view['afe_job_id'], self.user) 1077 1078 1079 def get_bug_info(self, suite_job_keyvals): 1080 """Get the bug info from suite_job_keyvals. 1081 1082 If a bug has been filed for the test, its bug info (bug id and counts) 1083 will be stored in the suite job's keyvals. This method attempts to 1084 retrieve bug info of the test from |suite_job_keyvals|. It will return 1085 None if no bug info is found. No need to check bug info if the view is 1086 SUITE_JOB. 1087 1088 @param suite_job_keyvals: The job keyval dictionary of the suite job. 1089 All the bug info about child jobs are stored in 1090 suite job's keyvals. 1091 1092 @returns: None if there is no bug info, or a pair with the 1093 id of the bug, and the count of the number of 1094 times the bug has been seen. 1095 1096 """ 1097 if self.get_testname() == self.SUITE_JOB: 1098 return None 1099 if (self.view['test_name'].startswith('SERVER_JOB') or 1100 self.view['test_name'].startswith('CLIENT_JOB')): 1101 # Append job name as a prefix for SERVER_JOB and CLIENT_JOB 1102 testname= '%s_%s' % (self.view['job_name'], self.view['test_name']) 1103 else: 1104 testname = self.view['test_name'] 1105 1106 return tools.get_test_failure_bug_info( 1107 suite_job_keyvals, self.view['afe_job_id'], 1108 testname) 1109 1110 1111 def should_display_buildbot_link(self): 1112 """Check whether a buildbot link should show for this view. 1113 1114 For suite job view, show buildbot link if it fails. 1115 For normal test view, 1116 show buildbot link if it is a retry 1117 show buildbot link if it hits its own timeout. 1118 show buildbot link if it fails. This doesn't 1119 include the case where it was aborted but has 1120 not hit its own timeout (most likely it was aborted because 1121 suite has timed out). 1122 1123 @returns: True if we should show the buildbot link. 1124 False otherwise. 1125 """ 1126 is_bad_status = (self.view['status'] != 'GOOD' and 1127 self.view['status'] != 'TEST_NA') 1128 if self.get_testname() == self.SUITE_JOB: 1129 return is_bad_status 1130 else: 1131 if self.is_retry(): 1132 return True 1133 if is_bad_status: 1134 return not self.is_aborted() or self.hit_timeout() 1135 1136 1137 def get_control_file_attributes(self): 1138 """Get the attributes from the control file of the test. 1139 1140 @returns: A list of test attribute or None. 1141 """ 1142 control_file = self.afe_job.control_file 1143 attributes = None 1144 if control_file: 1145 cd = control_data.parse_control_string(control_file) 1146 attributes = list(cd.attributes) 1147 return attributes 1148 1149 1150 def override_afe_job_id(self, afe_job_id): 1151 """Overrides the AFE job id for the test. 1152 1153 @param afe_job_id: The new AFE job id to use. 1154 """ 1155 self.view['afe_job_id'] = afe_job_id 1156 1157 1158def log_buildbot_links(log_func, links): 1159 """Output buildbot links to log. 1160 1161 @param log_func: Logging function to use. 1162 @param links: Iterable of LogLink instances. 1163 """ 1164 for link in links: 1165 for generated_link in link.GenerateBuildbotLinks(): 1166 log_func(generated_link) 1167 retry_link = link.GenerateRetryLink() 1168 if retry_link: 1169 log_func(retry_link) 1170 history_link = link.GenerateHistoryLink() 1171 if history_link: 1172 log_func(history_link) 1173 1174 1175class _ReturnCodeComputer(object): 1176 """This is responsible for returning the _ReturnResult for a suite.""" 1177 1178 def __call__(self, test_views): 1179 """Compute the exit code based on test results.""" 1180 result = _RETURN_RESULTS['ok'] 1181 1182 for v in test_views: 1183 if v.get_testname() == TestView.SUITE_JOB: 1184 result |= self._get_suite_result(v) 1185 else: 1186 result |= self._get_test_result(v) 1187 return result 1188 1189 def _get_suite_result(self, test_view): 1190 """Return the _ReturnResult for the given suite job.""" 1191 # The order of checking each case is important. 1192 if test_view.is_aborted() and test_view.hit_timeout(): 1193 return _RETURN_RESULTS['suite_timeout'] 1194 elif test_view.is_in_fail_status(): 1195 return _RETURN_RESULTS['suite_failed'] 1196 elif test_view['status'] == 'WARN': 1197 return _RETURN_RESULTS['suite_warning'] 1198 else: 1199 return _RETURN_RESULTS['ok'] 1200 1201 def _get_test_result(self, test_view): 1202 """Return the _ReturnResult for the given test job.""" 1203 # The order of checking each case is important. 1204 if test_view.is_aborted() and test_view.is_relevant_suite_view(): 1205 # The test was aborted before started 1206 # This gurantees that the suite has timed out. 1207 return _RETURN_RESULTS['test_aborted_prestart'] 1208 elif test_view.is_aborted() and not test_view.hit_timeout(): 1209 # The test was aborted, but 1210 # not due to a timeout. This is most likely 1211 # because the suite has timed out, but may 1212 # also because it was aborted by the user. 1213 # Since suite timing out is determined by checking 1214 # the suite job view, we simply ignore this view here. 1215 return _RETURN_RESULTS['test_aborted_mystery'] 1216 elif test_view.is_in_fail_status(): # The test job failed 1217 if test_view.is_provision(): 1218 return _RETURN_RESULTS['provision_failed'] 1219 else: 1220 return _RETURN_RESULTS['test_failure'] 1221 elif test_view['status'] == 'WARN': 1222 return _RETURN_RESULTS['test_warning'] 1223 elif test_view.is_retry(): 1224 # The test is a passing retry. 1225 return _RETURN_RESULTS['test_retry'] 1226 else: 1227 return _RETURN_RESULTS['ok'] 1228 1229 1230class _ProvisionReturnCodeComputer(_ReturnCodeComputer): 1231 """This is used for returning the _ReturnResult for provision suites.""" 1232 1233 def __init__(self, num_required): 1234 """Initialize instance. 1235 1236 num_required is the number of passing provision jobs needed. 1237 """ 1238 super(_ProvisionReturnCodeComputer, self).__init__() 1239 self._num_required = num_required 1240 self._num_successful = 0 1241 1242 def __call__(self, test_views): 1243 result = super(_ProvisionReturnCodeComputer, self).__call__(test_views) 1244 if self._num_successful >= self._num_required: 1245 logging.info('Return result upgraded from %r' 1246 ' due to enough ok provisions', 1247 result) 1248 return _RETURN_RESULTS['ok'] 1249 else: 1250 return result 1251 1252 def _get_test_result(self, test_view): 1253 result = (super(_ProvisionReturnCodeComputer, self) 1254 ._get_test_result(test_view)) 1255 if result in {_RETURN_RESULTS[s] for s in ('ok', 'test_retry')}: 1256 self._num_successful += 1 1257 return result 1258 1259 1260class ResultCollector(object): 1261 """Collect test results of a suite or a single test run. 1262 1263 Once a suite job has finished, use this class to collect test results. 1264 `run` is the core method that is to be called first. Then the caller 1265 could retrieve information like return code, return message, is_aborted, 1266 and timings by accessing the collector's public attributes. And output 1267 the test results and links by calling the 'output_*' methods. 1268 1269 Here is a overview of what `run` method does. 1270 1271 1) Collect the suite job's results from tko_test_view_2. 1272 For the suite job, we only pull test views without a 'subdir'. 1273 A NULL subdir indicates that the test was _not_ executed. This could be 1274 that no child job was scheduled for this test or the child job got 1275 aborted before starts running. 1276 (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially) 1277 1278 2) Collect the child jobs' results from tko_test_view_2. 1279 For child jobs, we pull all the test views associated with them. 1280 (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially) 1281 1282 3) Generate web and buildbot links. 1283 4) Compute timings of the suite run. 1284 5) Compute the return code based on test results. 1285 1286 @var _instance_server: The hostname of the server that is used 1287 to service the suite. 1288 @var _afe: The afe rpc client. 1289 @var _tko: The tko rpc client. 1290 @var _build: The build for which the suite is run, 1291 e.g. 'lumpy-release/R35-5712.0.0' 1292 @var _suite_name: The suite name, e.g. 'bvt', 'dummy'. 1293 @var _suite_job_id: The job id of the suite for which we are going to 1294 collect results. 1295 @var _original_suite_name: The suite name we record timing would be 1296 different from _suite_name when running 1297 suite_attr_wrapper. 1298 @var _return_code_function: Called to return what the overall result of 1299 the suite is. 1300 @var _suite_views: A list of TestView objects, representing relevant 1301 test views of the suite job. 1302 @var _child_views: A list of TestView objects, representing test views 1303 of the child jobs. 1304 @var _test_views: A list of TestView objects, representing all test views 1305 from _suite_views and _child_views. 1306 @var _web_links: A list of web links pointing to the results of jobs. 1307 @var buildbot_links: A list of buildbot links for non-passing tests. 1308 @var _solo_test_run: True if this is a single test run. 1309 @var return_result: The _ReturnResult of the suite run. 1310 @var is_aborted: Whether the suite was aborted or not. 1311 True, False or None (aborting status is unknown yet) 1312 @var timings: A Timing object that records the suite's timings. 1313 1314 """ 1315 1316 1317 def __init__(self, instance_server, afe, tko, build, 1318 suite_name, suite_job_id, return_code_function, 1319 original_suite_name=None, 1320 user=None, solo_test_run=False): 1321 self._instance_server = instance_server 1322 self._afe = afe 1323 self._tko = tko 1324 self._build = build 1325 self._suite_name = suite_name 1326 self._suite_job_id = suite_job_id 1327 self._original_suite_name = original_suite_name or suite_name 1328 self._return_code_function = return_code_function 1329 self._suite_views = [] 1330 self._child_views = [] 1331 self._test_views = [] 1332 self._retry_counts = {} 1333 self._missing_results = {} 1334 self._web_links = [] 1335 self.buildbot_links = [] 1336 self._num_child_jobs = 0 1337 self.return_result = None 1338 self.is_aborted = None 1339 self.timings = None 1340 self._user = user or getpass.getuser() 1341 self._solo_test_run = solo_test_run 1342 1343 1344 def _fetch_relevant_test_views_of_suite(self): 1345 """Fetch relevant test views of the suite job. 1346 1347 For the suite job, there will be a test view for SERVER_JOB, and views 1348 for results of its child jobs. For example, assume we've created 1349 a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail, 1350 dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while 1351 dummy_Path.bluetooth got TEST_NA as no duts have bluetooth. 1352 So the suite job's test views would look like 1353 _____________________________________________________________________ 1354 test_idx| job_idx|test_name |subdir |afe_job_id|status 1355 10 | 1000 |SERVER_JOB |---- |40 |GOOD 1356 11 | 1000 |dummy_Pass |NULL |40 |ABORT 1357 12 | 1000 |dummy_Fail.Fail |41-onwer/...|40 |FAIL 1358 13 | 1000 |dummy_Fail.Error |42-owner/...|40 |ERROR 1359 14 | 1000 |dummy_Pass.bluetooth|NULL |40 |TEST_NA 1360 1361 For a suite job, we only care about 1362 a) The test view for the suite job's SERVER_JOB 1363 b) The test views for real tests without a subdir. A NULL subdir 1364 indicates that a test didn't get executed. 1365 So, for the above example, we only keep test views whose test_idxs 1366 are 10, 11, 14. 1367 1368 @returns: A list of TestView objects, representing relevant 1369 test views of the suite job. 1370 1371 """ 1372 suite_job = self._afe.get_jobs(id=self._suite_job_id)[0] 1373 views = self._tko.run(call='get_detailed_test_views', 1374 afe_job_id=self._suite_job_id) 1375 relevant_views = [] 1376 for v in views: 1377 v = TestView(v, suite_job, self._suite_name, self._build, self._user, 1378 solo_test_run=self._solo_test_run) 1379 if v.is_relevant_suite_view(): 1380 # If the test doesn't have results in TKO and is being 1381 # displayed in the suite view instead of the child view, 1382 # then afe_job_id is incorrect and from the suite. 1383 # Override it based on the AFE job id which was missing 1384 # results. 1385 # TODO: This is likely inaccurate if a test has multiple 1386 # tries which all fail TKO parse stage. 1387 if v['test_name'] in self._missing_results: 1388 v.override_afe_job_id( 1389 self._missing_results[v['test_name']][0]) 1390 relevant_views.append(v) 1391 return relevant_views 1392 1393 1394 def _compute_retry_count(self, view): 1395 """Return how many times the test has been retried. 1396 1397 @param view: A TestView instance. 1398 @returns: An int value indicating the retry count. 1399 1400 """ 1401 old_job = view['job_keyvals'].get('retry_original_job_id') 1402 count = 0 1403 while old_job: 1404 count += 1 1405 views = self._tko.run( 1406 call='get_detailed_test_views', afe_job_id=old_job) 1407 old_job = (views[0]['job_keyvals'].get('retry_original_job_id') 1408 if views else None) 1409 return count 1410 1411 1412 def _fetch_test_views_of_child_jobs(self, jobs=None): 1413 """Fetch test views of child jobs. 1414 1415 @returns: A tuple (child_views, retry_counts, missing_results) 1416 child_views is list of TestView objects, representing 1417 all valid views. 1418 retry_counts is a dictionary that maps test_idx to retry 1419 counts. It only stores retry counts that are greater than 0. 1420 missing_results is a dictionary that maps test names to 1421 lists of job ids. 1422 1423 """ 1424 child_views = [] 1425 retry_counts = {} 1426 missing_results = {} 1427 child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id) 1428 if child_jobs: 1429 self._num_child_jobs = len(child_jobs) 1430 for job in child_jobs: 1431 views = [TestView(v, job, self._suite_name, self._build, self._user) 1432 for v in self._tko.run( 1433 call='get_detailed_test_views', afe_job_id=job.id, 1434 invalid=0)] 1435 if len(views) == 0: 1436 missing_results.setdefault(job.name, []).append(job.id) 1437 contains_test_failure = any( 1438 v.is_test() and v['status'] != 'GOOD' for v in views) 1439 for v in views: 1440 if (v.is_test() or 1441 v['status'] != 'GOOD' and not contains_test_failure): 1442 # For normal test view, just keep it. 1443 # For SERVER_JOB or CLIENT_JOB, only keep it 1444 # if it fails and no other test failure. 1445 child_views.append(v) 1446 retry_count = self._compute_retry_count(v) 1447 if retry_count > 0: 1448 retry_counts[v['test_idx']] = retry_count 1449 return child_views, retry_counts, missing_results 1450 1451 1452 def _generate_web_and_buildbot_links(self): 1453 """Generate web links and buildbot links.""" 1454 # TODO(fdeng): If a job was aborted before it reaches Running 1455 # state, we read the test view from the suite job 1456 # and thus this method generates a link pointing to the 1457 # suite job's page for the aborted job. Need a fix. 1458 self._web_links = [] 1459 self.buildbot_links = [] 1460 1461 # Bug info are stored in the suite job's keyvals. 1462 if self._solo_test_run: 1463 suite_job_keyvals = {} 1464 elif not self._suite_views: 1465 suite_job_keyvals = {} 1466 else: 1467 suite_job_keyvals = self._suite_views[0]['job_keyvals'] 1468 1469 for v in self._test_views: 1470 retry_count = self._retry_counts.get(v['test_idx'], 0) 1471 bug_info = v.get_bug_info(suite_job_keyvals) 1472 job_id_owner = v.get_job_id_owner_str() 1473 link = LogLink( 1474 anchor=v.get_testname(), 1475 server=self._instance_server, 1476 job_string=job_id_owner, 1477 bug_info=bug_info, retry_count=retry_count, 1478 testname=v.get_testname(), 1479 sponge_url=suite_job_keyvals.get('sponge_url')) 1480 self._web_links.append(link) 1481 1482 if v.should_display_buildbot_link(): 1483 link.reason = v.get_buildbot_link_reason() 1484 self.buildbot_links.append(link) 1485 1486 1487 def _record_timings(self): 1488 """Record suite timings.""" 1489 self.timings = Timings(self._suite_job_id) 1490 for v in self._test_views: 1491 self.timings.RecordTiming(v) 1492 1493 1494 def _compute_return_code(self): 1495 """Compute the exit code based on test results.""" 1496 self.return_result = self._return_code_function(self._test_views) 1497 1498 1499 def _make_test_results(self): 1500 """Make TestResults for collected tests. 1501 1502 @returns: List of TestResult instances. 1503 """ 1504 test_results = [] 1505 for test_view in self._test_views: 1506 test_result = TestResult( 1507 test_view=test_view, 1508 retry_count=self._retry_counts.get(test_view['test_idx'], 0)) 1509 test_results.append(test_result) 1510 return test_results 1511 1512 1513 def output_results(self): 1514 """Output test results, timings and web links.""" 1515 # Output test results 1516 test_results = self._make_test_results() 1517 if len(test_results) == 0: 1518 max_name_length = 0 1519 else: 1520 max_name_length = max(len(t.name) for t in test_results) 1521 for test_result in test_results: 1522 test_result.log_using(logging.info, max_name_length + 3) 1523 # Output suite timings 1524 logging.info(self.timings) 1525 # Output links to test logs 1526 logging.info('\nLinks to test logs:') 1527 for link in self._web_links: 1528 logging.info(link.text_link) 1529 logging.info('\n') 1530 1531 1532 def get_results_dict(self): 1533 """Write test results, timings and web links into a dict. 1534 1535 @returns: A dict of results in the format like: 1536 { 1537 'tests': { 1538 'test_1': {'status': 'PASSED', 'attributes': [1,2], ...} 1539 'test_2': {'status': 'FAILED', 'attributes': [1],...} 1540 } 1541 'suite_timings': { 1542 'download_start': '1998-07-17 00:00:00', 1543 'payload_download_end': '1998-07-17 00:00:05', 1544 ... 1545 } 1546 } 1547 """ 1548 output_dict = {} 1549 tests_dict = output_dict.setdefault('tests', {}) 1550 for v in self._test_views: 1551 test_name = v.get_testname() 1552 test_info = tests_dict.setdefault(test_name, {}) 1553 test_info.update({ 1554 'status': v['status'], 1555 'attributes': v.get_control_file_attributes() or list(), 1556 'reason': v['reason'], 1557 'retry_count': self._retry_counts.get(v['test_idx'], 0), 1558 }) 1559 # For aborted test, the control file will not be parsed and thus 1560 # fail to get the attributes info. Therefore, the subsystems the 1561 # abort test testing will be missing. For this case, we will assume 1562 # the aborted test will test all subsystems, set subsystem:default. 1563 if (test_info['status'] == 'ABORT' and 1564 not any('subsystem:' in a for a in test_info['attributes'])): 1565 test_info['attributes'].append('subsystem:default') 1566 1567 # Write the links to test logs into the |tests_dict| of |output_dict|. 1568 # For test whose status is not 'GOOD', the link is also buildbot_link. 1569 for link in self._web_links: 1570 test_name = link.anchor.strip() 1571 test_info = tests_dict.get(test_name) 1572 if test_info: 1573 test_info['link_to_logs'] = link.url 1574 test_info['sponge_url'] = link.sponge_url 1575 # Write the retry dashboard link into the dict. 1576 if link in self.buildbot_links and link.testname: 1577 test_info['retry_dashboard_link'] \ 1578 = reporting_utils.link_retry_url(link.testname) 1579 # Always write the wmatrix link for compatibility. 1580 test_info['wmatrix_link'] \ 1581 = reporting_utils.link_wmatrix_retry_url(link.testname) 1582 # Write the bug url into the dict. 1583 if link.bug_id: 1584 test_info['bug_url'] = link.bug_url 1585 1586 # Write the suite timings into |output_dict| 1587 timings = self.timings 1588 if timings is not None: 1589 time_dict = output_dict.setdefault('suite_timings', {}) 1590 time_dict.update({ 1591 'download_start' : str(timings.download_start_time), 1592 'payload_download_end' : str(timings.payload_end_time), 1593 'suite_start' : str(timings.suite_start_time), 1594 'artifact_download_end' : str(timings.artifact_end_time), 1595 'tests_start' : str(timings.tests_start_time), 1596 'tests_end' : str(timings.tests_end_time), 1597 }) 1598 1599 output_dict['suite_job_id'] = self._suite_job_id 1600 1601 return output_dict 1602 1603 1604 def run(self): 1605 """Collect test results. 1606 1607 This method goes through the following steps: 1608 Fetch relevent test views of the suite job. 1609 Fetch test views of child jobs 1610 Check whether the suite was aborted. 1611 Generate links. 1612 Calculate suite timings. 1613 Compute return code based on the test result. 1614 1615 """ 1616 if self._solo_test_run: 1617 self._test_views, self._retry_counts, self._missing_results = ( 1618 self._fetch_test_views_of_child_jobs( 1619 jobs=self._afe.get_jobs(id=self._suite_job_id))) 1620 else: 1621 self._child_views, self._retry_counts, self._missing_results = ( 1622 self._fetch_test_views_of_child_jobs()) 1623 self._suite_views = self._fetch_relevant_test_views_of_suite() 1624 self._test_views = self._suite_views + self._child_views 1625 # For hostless job in Starting status, there is no test view associated. 1626 # This can happen when a suite job in Starting status is aborted. When 1627 # the scheduler hits some limit, e.g., max_hostless_jobs_per_drone, 1628 # max_jobs_started_per_cycle, a suite job can stays in Starting status. 1629 if not self._test_views: 1630 self.return_result = _RETURN_RESULTS['test_views_missing'] 1631 return 1632 self.is_aborted = any([view['job_keyvals'].get('aborted_by') 1633 for view in self._suite_views]) 1634 self._generate_web_and_buildbot_links() 1635 self._record_timings() 1636 self._compute_return_code() 1637 1638 1639 def gather_timing_stats(self): 1640 """Collect timing related statistics.""" 1641 # Record suite runtime in metadata db. 1642 # Some failure modes can leave times unassigned, report sentinel value 1643 # in that case. 1644 runtime_in_secs = -1 1645 if (self.timings.tests_end_time is not None and 1646 self.timings.suite_start_time is not None): 1647 runtime_in_secs = (self.timings.tests_end_time - 1648 self.timings.suite_start_time).total_seconds() 1649 1650 1651def _make_child_dependencies(options): 1652 """Creates a list of extra dependencies for child jobs. 1653 1654 @param options: Parsed arguments to run_suite. 1655 1656 @returns: A list of label strings if any dependencies should be added. None 1657 otherwise. 1658 """ 1659 if not options.model: 1660 return () 1661 return ('model:%s' % options.model,) 1662 1663 1664@retry.retry(error.StageControlFileFailure, timeout_min=10) 1665def create_suite(afe, options): 1666 """Create a suite with retries. 1667 1668 @param afe: The afe object to insert the new suite job into. 1669 @param options: The options to use in creating the suite. 1670 1671 @return: The afe_job_id of the new suite job. 1672 """ 1673 logging.info('%s Submitted create_suite_job rpc', 1674 diagnosis_utils.JobTimer.format_time(datetime.now())) 1675 1676 # TODO(crbug.com/763207): This is to support calling old moblab RPC 1677 # with ToT code. This does not need to be supported after M62. 1678 if options.oldrpc: 1679 suite_args = options.suite_args 1680 if 'tests' in suite_args: 1681 # This is for test_that_wrapper 1682 suite_args = ' '.join([':lab:'] + suite_args['tests']) 1683 else: 1684 # This is for suite_attr_wrapper 1685 suite_args = repr(suite_args) 1686 options.suite_args = suite_args 1687 1688 return afe.run( 1689 'create_suite_job', 1690 name=options.name, 1691 board=options.board, 1692 builds=suite_common.make_builds_from_options(options), 1693 test_source_build=options.test_source_build, 1694 check_hosts=not options.no_wait, 1695 pool=options.pool, 1696 file_bugs=options.file_bugs, 1697 priority=options.priority, 1698 suite_args=options.suite_args, 1699 wait_for_results=not options.no_wait, 1700 timeout_mins=options.timeout_mins + options.delay_minutes, 1701 max_runtime_mins=options.max_runtime_mins + options.delay_minutes, 1702 job_retry=options.retry, 1703 max_retries=options.max_retries, 1704 suite_min_duts=options.suite_min_duts, 1705 offload_failures_only=options.offload_failures_only, 1706 run_prod_code=options.run_prod_code, 1707 delay_minutes=options.delay_minutes, 1708 job_keyvals=options.job_keyvals, 1709 test_args=options.test_args, 1710 child_dependencies=options.child_dependencies, 1711 ) 1712 1713 1714def _run_suite(options): 1715 """ 1716 run_suite script without exception handling. 1717 1718 @param options: The parsed options. 1719 1720 @returns: A tuple contains the return_code of run_suite and the dictionary 1721 of the output. 1722 1723 """ 1724 # If indicate to use the new style suite control file, convert the args 1725 if options.use_suite_attr: 1726 options = change_options_for_suite_attr(options) 1727 1728 log_name = _get_log_name(options) 1729 utils.setup_logging(logfile=log_name) 1730 1731 if not options.bypass_labstatus and not options.web: 1732 utils.check_lab_status(options.build) 1733 1734 afe = _create_afe(options) 1735 instance_server = afe.server 1736 1737 rpc_helper = diagnosis_utils.RPCHelper(afe) 1738 is_real_time = True 1739 if options.mock_job_id: 1740 job_id = int(options.mock_job_id) 1741 existing_job = afe.get_jobs(id=job_id, finished=True) 1742 if existing_job: 1743 is_real_time = False 1744 else: 1745 existing_job = afe.get_jobs(id=job_id) 1746 if existing_job: 1747 job_created_on = time_utils.date_string_to_epoch_time( 1748 existing_job[0].created_on) 1749 else: 1750 raise utils.TestLabException('Failed to retrieve job: %d' % job_id) 1751 else: 1752 try: 1753 rpc_helper.check_dut_availability(options.dependencies, 1754 options.minimum_duts, 1755 options.skip_duts_check) 1756 job_id = create_suite(afe, options) 1757 job_created_on = time.time() 1758 except (error.CrosDynamicSuiteException, 1759 error.RPCException, proxy.JSONRPCException) as e: 1760 logging.exception('Error Message: %s', e) 1761 return run_suite_common.SuiteResult( 1762 run_suite_common.RETURN_CODES.INFRA_FAILURE, 1763 {'return_message': str(e)}) 1764 except AttributeError as e: 1765 logging.exception('Error Message: %s', e) 1766 return run_suite_common.SuiteResult( 1767 run_suite_common.RETURN_CODES.INVALID_OPTIONS) 1768 1769 job_timer = diagnosis_utils.JobTimer( 1770 job_created_on, float(options.timeout_mins)) 1771 job_url = reporting_utils.link_job(job_id, 1772 instance_server=instance_server) 1773 logging.info('%s Created suite job: %s', 1774 job_timer.format_time(job_timer.job_created_time), 1775 job_url) 1776 logging.info(annotations.StepLink( 1777 text='Link to suite', 1778 url=job_url)) 1779 1780 if options.create_and_return: 1781 msg = '--create_and_return was specified, terminating now.' 1782 logging.info(msg) 1783 return run_suite_common.SuiteResult( 1784 run_suite_common.RETURN_CODES.OK, 1785 {'return_message': msg}) 1786 1787 if options.no_wait: 1788 return _handle_job_nowait(job_id, options, instance_server) 1789 else: 1790 return _handle_job_wait(afe, job_id, options, job_timer, is_real_time) 1791 1792 1793def _get_log_name(options): 1794 """Return local log file's name. 1795 1796 @param options: Parsed options. 1797 1798 @return log_name, a string file name. 1799 """ 1800 if options.require_logfile: 1801 # options.build is verified to exist in verify_options. 1802 # convert build name from containing / to containing only _. 1803 log_name = 'run_suite-%s.log' % options.build.replace('/', '_') 1804 log_dir = os.path.join(common.autotest_dir, 'logs') 1805 if os.path.exists(log_dir): 1806 log_name = os.path.join(log_dir, log_name) 1807 1808 return log_name 1809 else: 1810 return None 1811 1812 1813def _create_afe(options): 1814 """Return an afe instance based on options. 1815 1816 @param options Parsed options. 1817 1818 @return afe, an AFE instance. 1819 """ 1820 instance_server = (options.web if options.web else 1821 instance_for_pool(options.pool)) 1822 afe = frontend_wrappers.RetryingAFE(server=instance_server, 1823 timeout_min=options.afe_timeout_mins, 1824 delay_sec=options.delay_sec) 1825 logging.info('Autotest instance created: %s', instance_server) 1826 return afe 1827 1828 1829def _handle_job_wait(afe, job_id, options, job_timer, is_real_time): 1830 """Handle suite job synchronously. 1831 1832 @param afe AFE instance. 1833 @param job_id Suite job id. 1834 @param options Parsed options. 1835 @param job_timer JobTimer for suite job. 1836 @param is_real_time Whether or not to handle job timeout. 1837 1838 @return SuiteResult of suite job. 1839 """ 1840 rpc_helper = diagnosis_utils.RPCHelper(afe) 1841 instance_server = afe.server 1842 while not afe.get_jobs(id=job_id, finished=True): 1843 _poke_buildbot_with_output(afe, job_id, job_timer) 1844 if job_timer.debug_output_timer.poll(): 1845 logging.info('The suite job has another %s till timeout.', 1846 job_timer.timeout_hours - job_timer.elapsed_time()) 1847 time.sleep(10) 1848 logging.info('%s Suite job is finished.', 1849 diagnosis_utils.JobTimer.format_time(datetime.now())) 1850 # For most cases, ResultCollector should be able to determine whether 1851 # a suite has timed out by checking information in the test view. 1852 # However, occationally tko parser may fail on parsing the 1853 # job_finished time from the job's keyval file. So we add another 1854 # layer of timeout check in run_suite. We do the check right after 1855 # the suite finishes to make it as accurate as possible. 1856 # There is a minor race condition here where we might have aborted 1857 # for some reason other than a timeout, and the job_timer thinks 1858 # it's a timeout because of the jitter in waiting for results. 1859 # The consequence would be that run_suite exits with code 1860 # SUITE_TIMEOUT while it should have returned INFRA_FAILURE 1861 # instead, which should happen very rarely. 1862 # Note the timeout will have no sense when using -m option. 1863 is_suite_timeout = job_timer.is_suite_timeout() 1864 1865 # Extract the original suite name to record timing. 1866 original_suite_name = get_original_suite_name(options.name, 1867 options.suite_args) 1868 # Start collecting test results. 1869 logging.info('%s Start collecting test results and dump them to json.', 1870 diagnosis_utils.JobTimer.format_time(datetime.now())) 1871 TKO = frontend_wrappers.RetryingTKO(server=instance_server, 1872 timeout_min=options.afe_timeout_mins, 1873 delay_sec=options.delay_sec) 1874 # TODO(crbug.com/672348): It needs to be possible for provision 1875 # suite to pass if only a few tests fail. Otherwise, a single 1876 # failing test will be reported as failure even if the suite reports 1877 # success. 1878 if options.name == _PROVISION_SUITE: 1879 # TODO(crbug.com/672348): Creating the suite job requires that 1880 # suite_args contains num_required. 1881 return_code_function = _ProvisionReturnCodeComputer( 1882 num_required=options.suite_args['num_required']) 1883 else: 1884 return_code_function = _ReturnCodeComputer() 1885 collector = ResultCollector(instance_server=instance_server, 1886 afe=afe, tko=TKO, build=options.build, 1887 suite_name=options.name, 1888 suite_job_id=job_id, 1889 return_code_function=return_code_function, 1890 original_suite_name=original_suite_name) 1891 collector.run() 1892 # Dump test outputs into json. 1893 output_dict = collector.get_results_dict() 1894 output_dict['autotest_instance'] = instance_server 1895 if not options.json_dump: 1896 collector.output_results() 1897 result = collector.return_result 1898 if is_real_time: 1899 # Do not record stats if the suite was aborted (either by a user 1900 # or through the golo rpc). 1901 # Also do not record stats if is_aborted is None, indicating 1902 # aborting status is unknown yet. 1903 if collector.is_aborted == False: 1904 logging.info('%s Gathering timing stats for the suite job.', 1905 diagnosis_utils.JobTimer.format_time(datetime.now())) 1906 collector.gather_timing_stats() 1907 1908 if collector.is_aborted == True and is_suite_timeout: 1909 # There are two possible cases when a suite times out. 1910 # 1. the suite job was aborted due to timing out 1911 # 2. the suite job succeeded, but some child jobs 1912 # were already aborted before the suite job exited. 1913 # The case 2 was handled by ResultCollector, 1914 # here we handle case 1. 1915 result |= _RETURN_RESULTS['suite_timeout'] 1916 logging.info('\n %s Attempting to display pool info: %s', 1917 diagnosis_utils.JobTimer.format_time(datetime.now()), 1918 options.pool) 1919 try: 1920 # Add some jitter to make up for any latency in 1921 # aborting the suite or checking for results. 1922 cutoff = job_timer.timeout_hours + timedelta(hours=0.3) 1923 rpc_helper.diagnose_pool(options.dependencies, cutoff) 1924 except proxy.JSONRPCException: 1925 logging.warning('Unable to display pool info.') 1926 1927 # And output return message. 1928 if result.message: 1929 logging.info('Reason: %s', result.message) 1930 1931 logging.info('\n %s Output below this line is for buildbot consumption:', 1932 diagnosis_utils.JobTimer.format_time(datetime.now())) 1933 log_buildbot_links(logging.info, collector.buildbot_links) 1934 return result.suite_result(output_dict) 1935 1936 1937def _handle_job_nowait(job_id, options, instance_server): 1938 """Handle suite job asynchronously. 1939 1940 @param job_id Suite job id. 1941 @param options Parsed options. 1942 @param instance_server Autotest instance hostname. 1943 1944 @return SuiteResult of suite job. 1945 """ 1946 logging.info('Created suite job: %r', job_id) 1947 link = LogLink(options.name, instance_server, 1948 '%s-%s' % (job_id, getpass.getuser())) 1949 for generate_link in link.GenerateBuildbotLinks(): 1950 logging.info(generate_link) 1951 logging.info('--no_wait specified; Exiting.') 1952 return run_suite_common.SuiteResult( 1953 run_suite_common.RETURN_CODES.OK, 1954 {'return_message': '--no_wait specified; Exiting.'}) 1955 1956 1957def _should_run(options): 1958 """Check whether the suite should be run based on lab/job status checking. 1959 1960 @param options Parsed options. 1961 """ 1962 try: 1963 site_utils.check_lab_status(options.test_source_build) 1964 except site_utils.TestLabException as ex: 1965 logging.exception('Lab is closed or build is blocked. Skipping ' 1966 'suite %s, board %s, build %s: %s', 1967 options.name, options.board, 1968 options.test_source_build, str(ex)) 1969 return False 1970 1971 start_time = str(datetime.now() - 1972 timedelta(days=_SEARCH_JOB_MAX_DAYS)) 1973 afe = _create_afe(options) 1974 afe_jobs = afe.get_jobs( 1975 name__istartswith=options.test_source_build, 1976 name__iendswith='control.'+options.name, 1977 created_on__gte=start_time, 1978 min_rpc_timeout=_MIN_RPC_TIMEOUT) 1979 if options.model: 1980 model_tag = 'model:%s' % options.model 1981 filtered_jobs = [j for j in afe_jobs if model_tag in j.control_file] 1982 else: 1983 filtered_jobs = afe_jobs 1984 1985 if filtered_jobs: 1986 logging.info('Found duplicate suite %s scheduled in past.', 1987 filtered_jobs) 1988 return False 1989 1990 return True 1991 1992 1993def _poke_buildbot_with_output(afe, job_id, job_timer): 1994 """Poke buildbot so it doesn't timeout from silence. 1995 1996 @param afe AFE instance. 1997 @param job_id Suite job id. 1998 @param job_timer JobTimer for suite job. 1999 """ 2000 rpc_helper = diagnosis_utils.RPCHelper(afe) 2001 # Note that this call logs output, preventing buildbot's 2002 # 9000 second silent timeout from kicking in. Let there be no 2003 # doubt, this is a hack. The timeout is from upstream buildbot and 2004 # this is the easiest work around. 2005 if job_timer.first_past_halftime(): 2006 rpc_helper.diagnose_job(job_id, afe.server) 2007 2008 2009 2010def _run_task(options): 2011 """Perform this script's function minus setup. 2012 2013 Boilerplate like argument parsing, logging, output formatting happen 2014 elsewhere. 2015 2016 Returns a SuiteResult instance. 2017 2018 TODO(ayatane): The try/except should be moved into _run_suite(). 2019 Good luck trying to figure out which function calls are supposed to 2020 raise which of the exceptions. 2021 """ 2022 try: 2023 return _run_suite(options) 2024 except diagnosis_utils.DUTsNotAvailableError as e: 2025 result = run_suite_common.SuiteResult( 2026 run_suite_common.RETURN_CODES.BOARD_NOT_AVAILABLE, 2027 {'return_message': 'Skipping testing: %s' % e.message}) 2028 logging.info(result.output_dict['return_message']) 2029 return result 2030 except utils.TestLabException as e: 2031 result = run_suite_common.SuiteResult( 2032 run_suite_common.RETURN_CODES.INFRA_FAILURE, 2033 {'return_message': 'TestLabException: %s' % e}) 2034 logging.exception(result.output_dict['return_message']) 2035 return result 2036 2037 2038class _ExceptionHandler(object): 2039 """Global exception handler replacement.""" 2040 2041 def __init__(self, dump_json): 2042 """Initialize instance. 2043 2044 @param dump_json: Whether to print a JSON dump of the result dict to 2045 stdout. 2046 """ 2047 self._should_dump_json = dump_json 2048 2049 def __call__(self, exc_type, value, traceback): 2050 if self._should_dump_json: 2051 run_suite_common.dump_json( 2052 {'return_message': ('Unhandled run_suite exception: %s' 2053 % value)}) 2054 sys.exit(run_suite_common.RETURN_CODES.INFRA_FAILURE) 2055 2056 2057def _check_if_use_skylab(options): 2058 """Detect whether to run suite in skylab.""" 2059 if not _ENABLE_RUN_SUITE_TRAMPOLINE: 2060 logging.info('trampoline to skylab is not enabled.') 2061 return False 2062 2063 task_info = 'suite:%s, board:%s, model:%s, pool:%s' % ( 2064 options.name, options.board, options.model, options.pool) 2065 ctx = gs.GSContext() 2066 with osutils.TempDir(prefix='trampoline_') as tempdir: 2067 temp_file = os.path.join(tempdir, _MIGRATION_CONFIG_FILE) 2068 ctx.Copy(_TRAMPOLINE_CONFIG, temp_file) 2069 _migration_config = config_reader.MigrationConfig( 2070 config_reader.ConfigReader(temp_file)) 2071 2072 logging.info('Checking whether to run in skylab: Task(%s)', task_info) 2073 if skylab.should_run_in_skylab(_migration_config, 2074 options.board, 2075 options.model, 2076 options.name, 2077 options.pool): 2078 logging.info('Task (%s) Should run in skylab', task_info) 2079 return True 2080 2081 logging.info('Task (%s) Should run in autotest', task_info) 2082 return False 2083 2084 2085def _run_with_skylab(options): 2086 """Run suite inside skylab.""" 2087 # TODO(xixuan): Implement running suite in skylab. 2088 return _RETURN_RESULTS['ok'] 2089 2090 2091def _run_with_autotest(options): 2092 """Run suite inside autotest.""" 2093 if options.pre_check and not _should_run(options): 2094 logging.info('Suite %s-%s is terminated: Lab is closed, OR build is ' 2095 'blocked, OR this suite has already been kicked off ' 2096 'once in past %d days.', 2097 options.test_source_build, options.name, 2098 _SEARCH_JOB_MAX_DAYS) 2099 result = run_suite_common.SuiteResult( 2100 run_suite_common.RETURN_CODES.ERROR, 2101 {'return_message': ("Lab is closed OR other reason" 2102 " (see code, it's complicated)")}) 2103 else: 2104 result = _run_task(options) 2105 2106 if options.json_dump: 2107 run_suite_common.dump_json(result.output_dict) 2108 2109 return result 2110 2111 2112def main(): 2113 """Entry point.""" 2114 utils.verify_not_root_user() 2115 2116 parser = make_parser() 2117 options = parser.parse_args() 2118 if options.do_nothing: 2119 return 0 2120 2121 sys.exceptionhandler = _ExceptionHandler(dump_json=options.json_dump) 2122 if options.json_dump: 2123 logging.disable(logging.CRITICAL) 2124 2125 options_okay = verify_and_clean_options(options) 2126 # Set StreamHandler first to capture error messages if suite is not run. 2127 utils.setup_logging() 2128 if not options_okay: 2129 parser.print_help() 2130 result = run_suite_common.SuiteResult( 2131 run_suite_common.RETURN_CODES.INVALID_OPTIONS) 2132 else: 2133 if _check_if_use_skylab(options): 2134 result = _run_with_skylab(options) 2135 else: 2136 result = _run_with_autotest(options) 2137 2138 logging.info('Will return from run_suite with status: %s', 2139 run_suite_common.RETURN_CODES.get_string(result.return_code)) 2140 return result.return_code 2141 2142 2143if __name__ == "__main__": 2144 sys.exit(main()) 2145