• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7
8"""Tool for running suites of tests and waiting for completion.
9
10The desired test suite will be scheduled with autotest. By default,
11this tool will block until the job is complete, printing a summary
12at the end.  Error conditions result in exceptions.
13
14This is intended for use only with Chrome OS test suites that leverage the
15dynamic suite infrastructure in server/cros/dynamic_suite.py.
16
17This script exits with one of the following codes:
180 - OK: Suite finished successfully
191 - ERROR: Test(s) failed, or hits its own timeout
202 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out.
213 - INFRA_FAILURE: Infrastructure related issues, e.g.
22    * Lab is down
23    * Too many duts (defined as a constant) in repair failed status
24    * Suite job issues, like bug in dynamic suite,
25      user aborted the suite, lose a drone/all devservers/rpc server,
26      0 tests ran, etc.
27    * provision failed
28      TODO(fdeng): crbug.com/413918, reexamine treating all provision
29                   failures as INFRA failures.
304 - SUITE_TIMEOUT: Suite timed out, some tests ran,
31    none failed by the time the suite job was aborted. This will cover,
32    but not limited to, the following cases:
33    * A devserver failure that manifests as a timeout
34    * No DUTs available midway through a suite
35    * Provision/Reset/Cleanup took longer time than expected for new image
36    * A regression in scheduler tick time.
375 - BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool.
386 - INVALID_OPTIONS: If options are not valid.
39"""
40
41import argparse
42import ast
43import collections
44from datetime import datetime
45from datetime import timedelta
46import functools
47import getpass
48import logging
49import os
50import re
51import sys
52import time
53import warnings
54
55import common
56from chromite.lib import buildbot_annotations as annotations
57from chromite.lib import gs
58from chromite.lib import osutils
59
60from django.core import exceptions as django_exceptions
61
62try:
63    from suite_scheduler import config_reader
64    from suite_scheduler import skylab
65except ImportError:
66    # For unittest
67    config_reader = None
68    skylab = None
69
70from autotest_lib.client.common_lib import control_data
71from autotest_lib.client.common_lib import error
72from autotest_lib.client.common_lib import global_config
73from autotest_lib.client.common_lib import priorities
74from autotest_lib.client.common_lib import time_utils
75from autotest_lib.client.common_lib.cros import retry
76from autotest_lib.frontend.afe import rpc_client_lib
77from autotest_lib.frontend.afe.json_rpc import proxy
78from autotest_lib.server import site_utils
79from autotest_lib.server import utils
80from autotest_lib.server.cros.dynamic_suite import constants
81from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
82from autotest_lib.server.cros.dynamic_suite import reporting_utils
83from autotest_lib.server.cros.dynamic_suite import suite_common
84from autotest_lib.server.cros.dynamic_suite import tools
85try:
86    from autotest_lib.site_utils import diagnosis_utils
87except django_exceptions.ImproperlyConfigured as e:
88    if 'Error loading MySQLdb module: libmariadbclient' in str(e):
89        logging.error('Unable to import a necessary MySQLdb module. This is '
90                      'commonly caused by running a command inside[outside] '
91                      'of the chroot but having autotest utility packages '
92                      'that were build outside[inside] the chroot. '
93                      'Please re-run utils/build_externals.py inside[outside] '
94                      'of the chroot accordingly.')
95    raise
96from autotest_lib.site_utils import run_suite_common
97
CONFIG = global_config.global_config

# Settings read once from the global config at import time.
_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value('SERVER', 'hostname',
                                                     type=str)
_URL_PATTERN = CONFIG.get_config_value(
        'CROS', 'log_url_pattern', type=str)
_ENABLE_RUN_SUITE_TRAMPOLINE = CONFIG.get_config_value(
        'CROS', 'enable_run_suite_trampoline', type=bool, default=False)

# The trampoline config is a gs:// path built from the bucket and file name.
_MIGRATION_CONFIG_FILE = 'migration_config.ini'
_MIGRATION_CONFIG_BUCKET = 'suite-scheduler.google.com.a.appspot.com'
_TRAMPOLINE_CONFIG = 'gs://%s/%s' % (
        _MIGRATION_CONFIG_BUCKET, _MIGRATION_CONFIG_FILE)

# Minimum RPC timeout for calls that are expected to take a long time, e.g.,
# create_suite_job. If the default socket timeout
# (socket.getdefaulttimeout()) is None or greater than this value, the
# default will be used.
# This matches the timeout of the RetryingAFE object, so that long running
# RPCs can wait long enough before being aborted.
_MIN_RPC_TIMEOUT = 600

# How many days back to look when searching for an existing job.
_SEARCH_JOB_MAX_DAYS = 14

_PROVISION_SUITE = 'provision'
122
123
@functools.total_ordering
class _ReturnResult(object):
    """Represents overall result of run_suite operation.

    _ReturnResult instances sort based on priority (the order in
    _RETURN_RESULTS).

    Furthermore, _ReturnResult instances can be combined by bitwise or
    ("union"), which returns the instance with the higher priority
    between the two (the instance with higher priority is a "superset"
    of the other).

    Do not create new instances of this; use _RETURN_RESULTS instead.
    """

    def __init__(self, return_code, message):
        """Initialize instance.

        @param return_code: return code carried by this result; it is passed
                            on to SuiteResult by suite_result().
        @param message: description of the result; may be empty.
        """
        self.return_code = return_code
        self.message = message

    def __repr__(self):
        return '<{cls} {key}, {this.return_code}, {this.message}>'.format(
            cls=type(self).__name__,
            key=self._getkey(),
            this=self)

    def __gt__(self, other):
        # total_ordering derives the remaining ordering methods from this
        # one together with __eq__.
        if isinstance(other, type(self)):
            return self._getkey() > other._getkey()
        else:
            return NotImplemented

    def __eq__(self, other):
        if isinstance(other, type(self)):
            return (self.return_code == other.return_code
                    and self.message == other.message)
        else:
            return NotImplemented

    def __ne__(self, other):
        # Python 2 does not derive __ne__ from __eq__ (and total_ordering
        # does not supply it), so define it explicitly to keep != consistent
        # with ==.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self):
        # Built from the same fields as __eq__, so equal instances hash
        # equal.
        return hash(self.return_code) ^ hash(self.message)

    def __or__(self, other):
        """Return whichever of the two results has the higher priority."""
        if isinstance(other, type(self)):
            if self > other:
                return self
            else:
                return other
        else:
            return NotImplemented

    def _getkey(self):
        """Return sort key.

        The key is the position of this instance in _RETURN_RESULTS_LIST;
        list.index() raises ValueError for an instance that is not equal to
        any entry there.
        """
        return _RETURN_RESULTS_LIST.index(self)

    def suite_result(self, output_dict=None):
        """Make a SuiteResult using this _ReturnResult.

        @param output_dict: output_dict to merge into SuiteResult.  The
                            caller's dict is copied, not modified.

        @returns: a run_suite_common.SuiteResult built from this result's
                  return code and the merged dict.
        """
        if output_dict is None:
            output_dict = dict()
        else:
            output_dict = output_dict.copy()
        # An empty message is deliberately not recorded.
        if self.message:
            output_dict['return_message'] = self.message
        return run_suite_common.SuiteResult(self.return_code, output_dict)
190
191
# Every known result, keyed by name.  The insertion order is significant:
# _ReturnResult instances compare by their position in this table, so later
# entries win when two results are combined.
_RETURN_RESULTS = collections.OrderedDict(
    (key, _ReturnResult(code, message))
    for key, code, message in (
        ('ok', run_suite_common.RETURN_CODES.OK, ''),

        ('test_warning', run_suite_common.RETURN_CODES.WARNING,
         'Test job raised warning.'),
        ('suite_warning', run_suite_common.RETURN_CODES.WARNING,
         'Suite job raised warning.'),
        ('test_retry', run_suite_common.RETURN_CODES.WARNING,
         'Tests were retried.'),

        ('test_aborted_prestart',
         run_suite_common.RETURN_CODES.SUITE_TIMEOUT,
         'Tests were aborted before running; suite must have timed out.'),
        # This really indicates a user action or an infra failure. But,
        # suite timeouts cause similar failures in the individual tests, so
        # we must classify these lower than suite_timeout. In case of a
        # suite_timeout, the result from the suite job will promote the
        # result to suite_timeout.
        ('test_aborted_mystery',
         run_suite_common.RETURN_CODES.SUITE_TIMEOUT,
         'Tests were aborted after running, but before timeout; '
         'Test was manually aborted or parsing results failed: '
         'crbug.com/796348.'),
        ('suite_timeout', run_suite_common.RETURN_CODES.SUITE_TIMEOUT,
         'Suite job timed out.'),

        ('test_views_missing', run_suite_common.RETURN_CODES.INFRA_FAILURE,
         'No test views found.'),
        ('suite_failed', run_suite_common.RETURN_CODES.INFRA_FAILURE,
         'Suite job failed.'),
        ('provision_failed', run_suite_common.RETURN_CODES.INFRA_FAILURE,
         'Provisioning failed.'),

        ('test_failure', run_suite_common.RETURN_CODES.ERROR,
         'Tests failed.'),
    )
)
_RETURN_RESULTS_LIST = list(_RETURN_RESULTS.values())
229
230
def bool_str(x):
    """Argument type that accepts only the literal strings True and False.

    @param x: string representation of boolean value.

    @returns: True for 'True', False for 'False'.
    @raises argparse.ArgumentTypeError: for any other value.

    """
    if x not in ('True', 'False'):
        raise argparse.ArgumentTypeError(
            '%s is not one of True or False' % (x,))
    return x == 'True'
244
245
def _get_priority_value(x):
    """Coerce a priority given on the command line to its int value.

    A priority may be written as an int, as a string holding an int, or as
    the name of a priority level.  All three forms are turned into an int.

    Meant to be used as an argparse `type` callable.

    @param x: priority value as an int, int string, or name string

    @returns: int value of priority
    @raises argparse.ArgumentTypeError: if x is neither an int nor a known
                                        priority name.
    """
    # Numeric form first; anything that is not an int literal falls through
    # to the lookup by name.
    try:
        return int(x)
    except ValueError:
        pass

    try:
        return priorities.Priority.get_value(x)
    except AttributeError:
        known_names = ', '.join(priorities.Priority.names)
        raise argparse.ArgumentTypeError(
            'Unknown priority level %s.  Try one of %s.'
            % (x, known_names))
268
269
def make_parser():
    """Make ArgumentParser instance for run_suite.py.

    Several options that are logically booleans take an explicit "True" or
    "False" value (parsed by bool_str) instead of being plain flags; see the
    comment on --no_wait for the reason.

    @returns: a configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s [options]")
    parser.add_argument("-b", "--board", dest="board")
    parser.add_argument(
            "--model",
            help="The device model to run tests against. For non-unified "
                 "builds, model and board are synonymous, but board is more "
                 "accurate in some cases. Only pass this option if your build "
                 "is a unified build.",
    )
    parser.add_argument("-i", "--build", dest="build")
    parser.add_argument(
        "-w", "--web", dest="web", default=None,
        help="Address of a webserver to receive suite requests.")
    parser.add_argument(
        '--cheets_build', dest='cheets_build', default=None,
        help='ChromeOS Android build to be installed on dut.')
    parser.add_argument(
        '--firmware_rw_build', dest='firmware_rw_build', default=None,
        help='Firmware build to be installed in dut RW firmware.')
    parser.add_argument(
        '--firmware_ro_build', dest='firmware_ro_build', default=None,
        help='Firmware build to be installed in dut RO firmware.')
    parser.add_argument(
        '--test_source_build', dest='test_source_build', default=None,
        help=('Build that contains the test code, '
              'e.g., it can be the value of `--build`, '
              '`--firmware_rw_build` or `--firmware_ro_build` '
              'arguments. Default is None, that is, use the test '
              'code from `--build` (CrOS image)'))
    #  This should just be a boolean flag, but the autotest "proxy" code
    #  can't handle flags that don't take arguments.
    parser.add_argument(
        "-n", "--no_wait", dest="no_wait", default=False, type=bool_str,
        help='Must pass "True" or "False" if used.')
    # If you really want no pool, --pool="" will do it. USE WITH CARE.
    parser.add_argument("-p", "--pool", dest="pool", default="suites")
    parser.add_argument("-s", "--suite_name", dest="name")
    parser.add_argument("-a", "--afe_timeout_mins", type=int,
                        dest="afe_timeout_mins", default=30)
    parser.add_argument("-t", "--timeout_mins", type=int,
                        dest="timeout_mins", default=1440)
    parser.add_argument("-x", "--max_runtime_mins", type=int,
                        dest="max_runtime_mins", default=1440)
    parser.add_argument("-d", "--delay_sec", type=int,
                        dest="delay_sec", default=10)
    parser.add_argument("-m", "--mock_job_id", dest="mock_job_id",
                        help="Attach to existing job id for already running "
                        "suite, and creates report.")
    # NOTE(akeshet): This looks similar to --no_wait, but behaves differently.
    # --no_wait is passed in to the suite rpc itself and affects the suite,
    # while this does not.
    parser.add_argument("-c", "--create_and_return", dest="create_and_return",
                        action="store_true",
                        help="Create the suite and print the job id, then "
                        "finish immediately.")
    parser.add_argument("-u", "--num", dest="num", type=int, default=None,
                        help="Deprecated, does nothing.")
    #  Same boolean flag issue applies here.
    parser.add_argument(
        "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str,
        help=('File bugs on test failures. Must pass "True" or '
              '"False" if used.'))
    parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus",
                        action="store_true", help='Bypass lab status check.')
    # We allow either a number or a string for the priority.  This way, if you
    # know what you're doing, one can specify a custom priority level between
    # other levels.
    parser.add_argument("-r", "--priority", dest="priority",
                        type=_get_priority_value,
                        default=priorities.Priority.DEFAULT,
                        action="store",
                        help="Priority of suite. Either numerical value, or "
                        "one of (" + ", ".join(priorities.Priority.names)
                        + ").")
    parser.add_argument(
        '--retry', dest='retry', default=False, type=bool_str, action='store',
        help='Enable test retry.  Must pass "True" or "False" if used.')
    # The adjacent literals are concatenated, so the first one needs its
    # trailing space; without it the help reads "Maximum retriesallowed".
    parser.add_argument('--max_retries', dest='max_retries', default=None,
                        type=int, action='store', help='Maximum retries '
                        'allowed at suite level. No limit if not specified.')
    parser.add_argument('--minimum_duts', dest='minimum_duts', type=int,
                        default=0, action='store',
                        help='Check that the pool has at least such many '
                        'healthy machines, otherwise suite will not run. '
                        'Default to 0.')
    parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int,
                        default=0, action='store',
                        help='Preferred minimum number of machines. Scheduler '
                        'will prioritize on getting such many machines for '
                        'the suite when it is competing with another suite '
                        'that has a higher priority but already got minimum '
                        'machines it needs. Default to 0.')
    # ast.literal_eval only accepts Python literals, so the value given on
    # the command line is parsed but never executed.
    parser.add_argument("--suite_args", dest="suite_args",
                        type=ast.literal_eval,
                        default=None, action="store",
                        help="A dict of args passed to the suite control file.")
    parser.add_argument('--offload_failures_only',
                        dest='offload_failures_only', type=bool_str,
                        action='store', default=False,
                        help='Only enable gs_offloading for failed tests. '
                        'Successful tests will be deleted. Must pass "True"'
                        ' or "False" if used.')
    parser.add_argument('--use_suite_attr', dest='use_suite_attr',
                        action='store_true', default=False,
                        help='Advanced. Run the suite based on ATTRIBUTES of '
                        'control files, rather than SUITE.')
    parser.add_argument('--json_dump', dest='json_dump', action='store_true',
                        default=False,
                        help='Dump the output of run_suite to stdout.')
    parser.add_argument(
        '--run_prod_code', dest='run_prod_code',
        action='store_true', default=False,
        help='Run the test code that lives in prod aka the test '
        'code currently on the lab servers.')
    parser.add_argument(
        '--delay_minutes', type=int, default=0,
        help=('Delay the creation of test jobs for a given '
              'number of minutes. This argument can be used to '
              'force provision jobs being delayed, which helps '
              'to distribute loads across devservers.'))
    parser.add_argument(
        '--skip_duts_check', dest='skip_duts_check', action='store_true',
        default=False, help='If True, skip minimum available DUTs check')
    parser.add_argument(
        '--job_keyvals', dest='job_keyvals', type=ast.literal_eval,
        action='store', default=None,
        help='A dict of job keyvals to be inject to suite control file')
    parser.add_argument(
        '--test_args', dest='test_args', type=ast.literal_eval,
        action='store', default=None,
        help=('A dict of args passed all the way to each individual test that '
              'will be actually ran.'))
    parser.add_argument(
        '--require_logfile', action='store_true',
        help=('Stream logs of run_suite.py to a local file named '
              'run_suite-<build name>.log.'))

    # Used for monitoring purposes, to measure no-op swarming proxy latency.
    parser.add_argument('--do_nothing', action='store_true',
                        help=argparse.SUPPRESS)

    # Used when lab/job status checking is needed. Currently its only user is
    # suite scheduler v2.
    parser.add_argument(
        '--pre_check', action='store_true',
        help=('Check lab and job status before kicking off a suite. Used by '
              'suite scheduler v2.'))

    # TODO(crbug.com/763207): This is to support calling old moblab RPC
    # with ToT code.  This does not need to be supported after M62.
    parser.add_argument('--oldrpc', action='store_true',
                        help='Use old AFE RPC.')

    return parser
427
428
def verify_and_clean_options(options):
    """Verify the validity of options.

    Besides validating, this normalizes `options` in place: the deprecated
    `num` attribute is removed, `test_source_build` defaults to `build`, and
    `child_dependencies` / `dependencies` are filled in.  A message is
    printed to stdout for every problem found.

    @param options: The parsed options to verify.

    @returns: True if verification passes, False otherwise.

    """
    if options.mock_job_id and (
            not options.build or not options.name or not options.board):
        print('When using -m, need to specify build, board and suite '
              'name which you have used for creating the original job')
        return False
    if not options.build:
        print('Need to specify which build to use')
        return False
    if not options.board:
        print('Need to specify board')
        return False
    if not options.name:
        print('Need to specify suite name')
        return False

    if options.num is not None:
        warnings.warn('-u/--num option is deprecated; it does nothing.')
    # Drop the deprecated option so that nothing downstream can rely on it.
    del options.num

    if not options.retry and options.max_retries is not None:
        print('max_retries can only be used with --retry=True')
        return False
    if options.use_suite_attr and options.suite_args is not None:
        # Wrap the value in a 1-tuple: suite_args comes from
        # ast.literal_eval and may itself be a tuple, which a bare % would
        # try to unpack as several format arguments.
        print('The new suite control file cannot parse the suite_args: %s. '
              'Please do not specify any suite_args here.'
              % (options.suite_args,))
        return False
    if options.no_wait and options.retry:
        # Only a notice: verification still passes in this case.
        print('Test retry is not available when using --no_wait=True')

    # Default to use the test code in CrOS build.
    if not options.test_source_build and options.build:
        options.test_source_build = options.build
    options.child_dependencies = _make_child_dependencies(options)
    base_dependencies = ('board:%s' % options.board,
                         'pool:%s' % options.pool)
    # NOTE(review): the concatenation assumes _make_child_dependencies
    # returns a tuple -- confirm against its definition.
    options.dependencies = base_dependencies + options.child_dependencies
    return True
472
473
def change_options_for_suite_attr(options):
    """Change options to be prepared to run the suite_attr_wrapper.

    If specify 'use_suite_attr' from the cmd line, it indicates to run the
    new style suite control file, suite_attr_wrapper. Then, change the
    options.name to 'suite_attr_wrapper', change the options.suite_args to
    include the arguments needed by suite_attr_wrapper.

    `options` is modified in place; any previous value of
    options.suite_args is replaced.

    @param options: The verified options.

    @returns: The changed options (the same object that was passed in).

    """
    # Convert the suite_name to attribute boolean expression.  isinstance()
    # rather than an exact type check, so that a str subclass is treated as
    # one suite name instead of being iterated character by character.
    if isinstance(options.name, str):
        attr_filter_val = 'suite:%s' % options.name
    else:
        attr_filter_val = ' or '.join(['suite:%s' % x for x in options.name])

    # Replace suite_args with the dict of arguments suite_attr_wrapper
    # needs; only 'attr_filter' is set.
    options.suite_args = {'attr_filter': attr_filter_val}
    options.name = 'suite_attr_wrapper'

    return options
501
502
class TestResult(object):

    """Outcome of one TestView, in a form that is ready to be logged."""

    # Statuses that are not reported as failures; any status missing from
    # this map is shown as FAILED.
    _PRETTY_STATUS_MAP = {
        'GOOD':    '[ PASSED ]',
        'TEST_NA': '[  INFO  ]',
    }

    def __init__(self, test_view, retry_count=0):
        """Initialize instance.

        @param test_view: TestView instance.
        @param retry_count: Retry count for test.  Optional.
        """
        self.name = test_view.get_testname()
        self.status = test_view['status']
        self.reason = test_view['reason']
        self.retry_count = retry_count

    @property
    def _pretty_status(self):
        """Bracketed status label used in the summary."""
        try:
            return self._PRETTY_STATUS_MAP[self.status]
        except KeyError:
            return '[ FAILED ]'

    def log_using(self, log_function, name_column_width):
        """Emit this result through the given log function.

        The first line holds the padded test name and the status label.  A
        status/reason line follows for anything other than GOOD, and a
        retry line follows when the test was retried.

        @param log_function: Log function to use.  Example: logging.info
        @param name_column_width: Width of name column for formatting.
        """
        name_column = self.name.ljust(name_column_width)
        log_function('%s%s', name_column, self._pretty_status)
        passed = self.status == 'GOOD'
        if not passed:
            log_function('%s  %s: %s', name_column, self.status, self.reason)
        was_retried = self.retry_count > 0
        if was_retried:
            log_function('%s  retry_count: %s', name_column, self.retry_count)
540
541
def get_original_suite_name(suite_name, suite_args):
    """Get the original suite name when running suite_attr_wrapper.

    @param suite_name: the name of the suite launched in afe. When it is
                       suite_attr_wrapper, the suite that actually running is
                       specified in the suite_args.
    @param suite_args: dict of suite args from argument parsing.  May be
                       None, which is treated like an empty dict.

    @returns: the original suite name: the first 'suite:<name>' term found
              in suite_args['attr_filter'], or suite_name itself when it is
              not the wrapper or no such term exists.

    """
    if suite_name != 'suite_attr_wrapper':
        return suite_name

    # --suite_args defaults to None, so do not assume a dict here.
    attrs = (suite_args or {}).get('attr_filter', '')
    prefix = 'suite:'
    # The filter is a boolean expression; splitting on parentheses and
    # spaces yields its individual terms.
    for term in re.split('[() ]', attrs):
        if term.startswith(prefix):
            return term[len(prefix):]
    return suite_name
559
560
class LogLink(object):
    """A link to be recorded in the logs for one job.

    Depending on what was supplied at construction time, the link can
    point at the job's log files or at a bug filed for a failure in
    that job.

    @var anchor  The link text.
    @var url     The link url.
    @var bug_id  Id of a bug to link to, or None.
    """

    _BUG_LINK_PREFIX = 'Auto-Bug'
    _LOG_LINK_PREFIX = 'Test-Logs'

    # A list of tests that don't get retried so skip the dashboard.
    _SKIP_RETRY_DASHBOARD = ['provision']


    def __init__(self, anchor, server, job_string, bug_info=None, reason=None,
                 retry_count=0, testname=None, sponge_url=None):
        """Set up the link, building the log URL from server and job.

        @param anchor      The link text.
        @param server      The hostname of the server this suite ran on.
        @param job_string  The job whose logs we'd like to link to.
        @param bug_info    Info about the bug, if one was filed; unpacked
                           as a (bug_id, bug_count) pair.
        @param reason      A string representing the reason of failure if any.
        @param retry_count How many times the test has been retried.
        @param testname    Optional Arg that supplies the testname.
        @param sponge_url  url to Sponge result.
        """
        self.anchor = anchor
        server_with_protocol = rpc_client_lib.add_protocol(server)
        self.url = _URL_PATTERN % (server_with_protocol, job_string)
        self.reason = reason
        self.retry_count = retry_count
        self.testname = testname
        self.sponge_url = sponge_url
        # Without bug info both bug fields stay None.
        self.bug_id, self.bug_count = bug_info if bug_info else (None, None)


    @property
    def bug_url(self):
        """URL of associated bug, or None when no bug id is set."""
        if not self.bug_id:
            return None
        return reporting_utils.link_crbug(self.bug_id)


    @property
    def _bug_count_text(self):
        """Return bug count as human friendly text."""
        count = self.bug_count
        if count is None:
            return 'unknown number of reports'
        if count == 1:
            return 'new report'
        return '%s reports' % count


    def GenerateBuildbotLinks(self):
        """Yield the links for this job in buildbot annotator format.

        A link to the bug comes first when a bug is associated with this
        link; a link to the job logs is always produced.

        @return A generator of links formatted for the buildbot log annotator.
        """
        if self.bug_url:
            yield self._get_link_to_bug()
        yield self._get_link_to_job_logs()


    def _get_link_to_bug(self):
        """Return buildbot link to bug.

        @return A link formatted for the buildbot log annotator.
        """
        # The bug link carries the report count on top of the usual info.
        details = self._get_info_strings() + [self._bug_count_text]
        text = self._format_anchor_text(self._BUG_LINK_PREFIX, details)
        return annotations.StepLink(text, self.bug_url)


    def _get_link_to_job_logs(self):
        """Return buildbot link to job logs.

        @return A link formatted for the buildbot log annotator.
        """
        details = self._get_info_strings()
        text = self._format_anchor_text(self._LOG_LINK_PREFIX, details)
        return annotations.StepLink(text, self.url)


    def _get_info_strings(self):
        """Return a list of info strings for _format_anchor_text()."""
        details = []
        if self.retry_count > 0:
            details.append('retry_count: %d' % self.retry_count)
        if self.reason:
            details.append(self.reason)
        return details


    def _format_anchor_text(self, prefix, info_strings):
        """Format anchor text given a prefix and info strings.

        @param prefix        The prefix of the anchor text.
        @param info_strings  Iterable of strings.
        @return A anchor_text with the right prefix and info strings.
        """
        stripped_anchor = self.anchor.strip()
        joined_info = ', '.join(info_strings)
        return '[{prefix}]: {anchor}: {info}'.format(
            prefix=prefix, anchor=stripped_anchor, info=joined_info)

    @property
    def text_link(self):
        """Link to the job's logs, for consumption by a human.

        @return A link formatted for human readability.
        """
        parts = (self.anchor, self.url)
        return '%s %s' % parts

    def _skips_dashboards(self):
        """Whether no dashboard link should be produced for this test.

        @return True when the testname is unset or is listed in
                _SKIP_RETRY_DASHBOARD.
        """
        return not self.testname or self.testname in self._SKIP_RETRY_DASHBOARD

    def GenerateRetryLink(self):
        """Generate a link to the retry dashboard.

        Currently always returns None; see the TODO below.

        @return A link formatted for the buildbot log annotator.
        """
        if self._skips_dashboards():
            return None

        # TODO(xixuan): Return the right flake dashboard later.
        return None

    def GenerateHistoryLink(self):
        """Generate a link to the test history dashboard.

        @return A link formatted for the buildbot log annotator, or None
                when the test is one that skips the dashboards.
        """
        if self._skips_dashboards():
            return None
        history_url = reporting_utils.link_test_history(self.testname)
        return annotations.StepLink(
            text='[Test-History]: %s' % self.testname,
            url=history_url)
714
715
716class Timings(object):
717    """Timings for important events during a suite.
718
719    All timestamps are datetime.datetime objects.
720
721    @var suite_job_id: the afe job id of the suite job for which
722                       we are recording the timing for.
723    @var download_start_time: the time the devserver starts staging
724                              the build artifacts. Recorded in create_suite_job.
725    @var payload_end_time: the time when the artifacts only necessary to start
726                           installing images onto DUT's are staged.
727                           Recorded in create_suite_job.
728    @var artifact_end_time: the remaining artifacts are downloaded after we kick
729                            off the reimaging job, at which point we record
730                            artifact_end_time. Recorded in dynamic_suite.py.
731    @var suite_start_time: the time the suite started.
732    @var tests_start_time: the time the first test started running.
733    @var tests_end_time: the time the last test finished running.
734    """
735
736    def __init__(self, suite_job_id):
737        self.suite_job_id = suite_job_id
738        # Timings related to staging artifacts on devserver.
739        self.download_start_time = None
740        self.payload_end_time = None
741        self.artifact_end_time = None
742
743        # The test_start_time, but taken off the view that corresponds to the
744        # suite instead of an individual test.
745        self.suite_start_time = None
746
747        # Earliest and Latest tests in the set of TestViews passed to us.
748        self.tests_start_time = None
749        self.tests_end_time = None
750
751
752    def RecordTiming(self, view):
753        """Given a test report view, extract and record pertinent time info.
754
755        get_detailed_test_views() returns a list of entries that provide
756        info about the various parts of a suite run.  This method can take
757        any one of these entries and look up timestamp info we might want
758        and record it.
759
760        If timestamps are unavailable, datetime.datetime.min/max will be used.
761
762        @param view: A TestView object.
763        """
764        start_candidate = datetime.min
765        end_candidate = datetime.max
766        if view['test_started_time']:
767            start_candidate = time_utils.time_string_to_datetime(
768                    view['test_started_time'])
769        if view['test_finished_time']:
770            end_candidate = time_utils.time_string_to_datetime(
771                    view['test_finished_time'])
772
773        if view.get_testname() == TestView.SUITE_JOB:
774            self.suite_start_time = start_candidate
775        else:
776            self._UpdateFirstTestStartTime(start_candidate)
777            self._UpdateLastTestEndTime(end_candidate)
778        if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view:
779            keyvals = view['job_keyvals']
780            self.download_start_time = time_utils.time_string_to_datetime(
781                    keyvals.get(constants.DOWNLOAD_STARTED_TIME),
782                    handle_type_error=True)
783
784            self.payload_end_time = time_utils.time_string_to_datetime(
785                    keyvals.get(constants.PAYLOAD_FINISHED_TIME),
786                    handle_type_error=True)
787
788            self.artifact_end_time = time_utils.time_string_to_datetime(
789                    keyvals.get(constants.ARTIFACT_FINISHED_TIME),
790                    handle_type_error=True)
791
792
793    def _UpdateFirstTestStartTime(self, candidate):
794        """Update self.tests_start_time, iff candidate is an earlier time.
795
796        @param candidate: a datetime.datetime object.
797        """
798        if not self.tests_start_time or candidate < self.tests_start_time:
799            self.tests_start_time = candidate
800
801
802    def _UpdateLastTestEndTime(self, candidate):
803        """Update self.tests_end_time, iff candidate is a later time.
804
805        @param candidate: a datetime.datetime object.
806        """
807        if not self.tests_end_time or candidate > self.tests_end_time:
808            self.tests_end_time = candidate
809
810
811    def __str__(self):
812        return ('\n'
813                'Suite timings:\n'
814                'Downloads started at %s\n'
815                'Payload downloads ended at %s\n'
816                'Suite started at %s\n'
817                'Artifact downloads ended (at latest) at %s\n'
818                'Testing started at %s\n'
819                'Testing ended at %s\n' % (self.download_start_time,
820                                           self.payload_end_time,
821                                           self.suite_start_time,
822                                           self.artifact_end_time,
823                                           self.tests_start_time,
824                                           self.tests_end_time))
825
826
def instance_for_pool(pool_name):
    """Look up the server that should service suites for a given pool.

    The lookup reads the POOL_INSTANCE_SHARDING config section with the
    pool name as key, passing _DEFAULT_AUTOTEST_INSTANCE as the default.

    @param pool_name: The pool (without the 'pool:' prefix) to schedule the
                      suite against.
    @return: The hostname that should be used to service this suite run.
    """
    section = 'POOL_INSTANCE_SHARDING'
    hostname = CONFIG.get_config_value(
            section, pool_name, default=_DEFAULT_AUTOTEST_INSTANCE)
    return hostname
838
839
class TestView(object):
    """Represents a test view and provides a set of helper functions."""


    SUITE_JOB = 'Suite job'

    # test_name prefixes of views that describe a job as a whole rather
    # than an individual test.
    _JOB_VIEW_PREFIXES = ('SERVER_JOB', 'CLIENT_JOB')


    def __init__(self, view, afe_job, suite_name, build, user,
                 solo_test_run=False):
        """Init a TestView object representing a tko test view.

        @param view: A dictionary representing a tko test view.
        @param afe_job: An instance of frontend.afe.models.Job
                        representing the job that kicked off the test.
        @param suite_name: The name of the suite
                           that the test belongs to.
        @param build: The build for which the test is run.
        @param user: The user for which the test is run.
        @param solo_test_run: This is a solo test run not part of a suite.
        """
        self.view = view
        self.afe_job = afe_job
        self.suite_name = suite_name
        self.build = build
        # A job without a parent is the suite job itself, unless this is a
        # solo test run.
        self.is_suite_view = afe_job.parent_job is None and not solo_test_run
        # This is the test name that will be shown in the output.
        # It is computed lazily and cached by get_testname().
        self.testname = None
        self.user = user

        # The case that a job was aborted before it got a chance to run
        # usually indicates suite has timed out (unless aborted by user).
        # In this case, the abort reason will be None.
        # Update the reason with proper information.
        if (self.is_relevant_suite_view() and
                self.get_testname() != self.SUITE_JOB and
                self.view['status'] == 'ABORT' and
                not self.view['reason']):
            self.view['reason'] = 'Timed out, did not run.'


    def __getitem__(self, key):
        """Overload __getitem__ so that we can still use []

        @param key: A key of the tko test view.

        @returns: The value of an attribute in the view.

        """
        return self.view[key]


    def __iter__(self):
        """Overload __iter__ so that it supports 'in' operator."""
        return iter(self.view)


    def _get_prefixed_test_name(self):
        """Get the view's test name before build/suite prefix removal.

        For SERVER_JOB and CLIENT_JOB views the job name is prepended so
        that views of different jobs can be told apart.

        @returns: '<job_name>_<test_name>' for SERVER_JOB/CLIENT_JOB views,
                  the plain view['test_name'] otherwise.

        """
        test_name = self.view['test_name']
        if test_name.startswith(self._JOB_VIEW_PREFIXES):
            return '%s_%s' % (self.view['job_name'], test_name)
        return test_name


    def get_testname(self):
        """Get test name that should be shown in the output.

        Formalize the test_name we got from the test view.

        Remove 'build/suite' prefix if any.

        If one runs a test in control file via the following code,
           job.runtest('my_Test', tag='tag')
        for most of the cases, view['test_name'] would look like 'my_Test.tag'.
        If this is the case, this method will just return the original
        test name, i.e. 'my_Test.tag'.

        There are four special cases.
        1) A test view is for the suite job's SERVER_JOB.
           In this case, this method will return 'Suite job'.

        2) A test view is of a child job or a solo test run not part of a
           suite, and for a SERVER_JOB or CLIENT_JOB.
           In this case, we will take the job name, remove the build/suite
           prefix from the job name, and use the rest as a prefix for
           'SERVER_JOB' or 'CLIENT_JOB'. So the names returned by this
           method will look like:
             'dummy_Pass_SERVER_JOB'
             'dummy_Fail_SERVER_JOB'

        3) A test view is of a suite job and its status is ABORT.
           In this case, the view['test_name'] is the child job's name.
           For instance,
             'lumpy-release/R35-5712.0.0/dummy/dummy_Pass'
             'lumpy-release/R35-5712.0.0/dummy/dummy_Fail'
           The above names will be converted to the following:
             'dummy_Pass'
             'dummy_Fail'

        4) A test view is of a suite job and its status is TEST_NA.
           In this case, the view['test_name'] is the NAME field of the control
           file. For instance,
             'dummy_Pass'
             'dummy_Fail'
           This method will not modify these names.

        @returns: Test name after normalization.

        """
        # The normalized name is cached after the first computation.
        if self.testname is not None:
            return self.testname

        if (self.is_suite_view and
                self.view['test_name'].startswith('SERVER_JOB')):
            # Rename suite job's SERVER_JOB to 'Suite job'.
            self.testname = self.SUITE_JOB
            return self.testname

        # Remove the build and suite name from testname if any.
        self.testname = tools.get_test_name(
                self.build, self.suite_name, self._get_prefixed_test_name())
        return self.testname


    def is_relevant_suite_view(self):
        """Checks whether this is a suite view we should care about.

        A suite view is relevant if it is the suite job's own view, or if
        it belongs to the suite job, is not a CLIENT_JOB view, and has no
        subdir.

        @returns: True if it is relevant. False otherwise.
        """
        return (self.get_testname() == self.SUITE_JOB or
                (self.is_suite_view and
                    not self.view['test_name'].startswith('CLIENT_JOB') and
                    not self.view['subdir']))


    def is_test(self):
        """Return whether the view is for an actual test.

        @returns True if the view is for an actual test.
                 False if the view is for SERVER_JOB or CLIENT_JOB.

        """
        return not self.view['test_name'].startswith(self._JOB_VIEW_PREFIXES)


    def is_retry(self):
        """Check whether the view is for a retry.

        @returns: True, if the view is for a retry; False otherwise.

        """
        return self.view['job_keyvals'].get('retry_original_job_id') is not None


    def hit_timeout(self):
        """Check whether the corresponding job has hit its own timeout.

        Note this method should not be called for those test views
        that belongs to a suite job and are determined as irrelevant
        by is_relevant_suite_view.  This is because they are associated
        to the suite job, whose job start/finished time make no sense
        to an irrelevant test view.

        @returns: True if the corresponding afe job has hit timeout.
                  False otherwise.
        """
        if (self.is_relevant_suite_view() and
                self.get_testname() != self.SUITE_JOB):
            # Any relevant suite test view except SUITE_JOB
            # did not hit its own timeout because it was not ever run.
            return False
        start = (datetime.strptime(
                self.view['job_started_time'], time_utils.TIME_FMT)
                if self.view['job_started_time'] else None)
        end = (datetime.strptime(
                self.view['job_finished_time'], time_utils.TIME_FMT)
                if self.view['job_finished_time'] else None)
        # Without both timestamps the runtime cannot be computed.
        if not start or not end:
            return False
        runtime_mins = (end - start).total_seconds() / 60.0
        return runtime_mins > self.afe_job.max_runtime_mins


    def is_aborted(self):
        """Check if the view was aborted.

        For suite job and child job test views, we check job keyval
        'aborted_by' and test status.

        For relevant suite job test views, we only check test status
        because the suite job keyval won't make sense to individual
        test views.

        @returns: True if the test was aborted, False otherwise.

        """

        if (self.is_relevant_suite_view() and
                self.get_testname() != self.SUITE_JOB):
            return self.view['status'] == 'ABORT'
        else:
            return (bool(self.view['job_keyvals'].get('aborted_by')) and
                    self.view['status'] in ['ABORT', 'RUNNING'])


    def is_in_fail_status(self):
        """Check if the given test's status corresponds to a failure.

        @returns: True if the test's status is FAIL, ERROR or ABORT.
                  False otherwise.

        """
        # All the statuses tests can have when they fail.
        return self.view['status'] in ['FAIL', 'ERROR', 'ABORT']


    def is_provision(self):
        """Check whether this is a provision test."""
        return self.get_testname() == 'provision'


    def get_buildbot_link_reason(self):
        """Generate the buildbot link reason for the test.

        @returns: A string representing the reason: '<status>: <reason>'
                  when the view has a reason, just the status otherwise.

        """
        return ('%s: %s' % (self.view['status'], self.view['reason'])
                if self.view['reason'] else self.view['status'])


    def get_job_id_owner_str(self):
        """Generate the job_id_owner string for a test.

        @returns: A string which looks like 135036-username

        """
        return '%s-%s' % (self.view['afe_job_id'], self.user)


    def get_bug_info(self, suite_job_keyvals):
        """Get the bug info from suite_job_keyvals.

        If a bug has been filed for the test, its bug info (bug id and counts)
        will be stored in the suite job's keyvals. This method attempts to
        retrieve bug info of the test from |suite_job_keyvals|. It will return
        None if no bug info is found. No need to check bug info if the view is
        SUITE_JOB.

        @param suite_job_keyvals: The job keyval dictionary of the suite job.
                All the bug info about child jobs are stored in
                suite job's keyvals.

        @returns: None if there is no bug info, or a pair with the
                  id of the bug, and the count of the number of
                  times the bug has been seen.

        """
        if self.get_testname() == self.SUITE_JOB:
            return None

        # The lookup uses the name with the job-name prefix but without
        # the build/suite prefix removed.
        return tools.get_test_failure_bug_info(
                suite_job_keyvals, self.view['afe_job_id'],
                self._get_prefixed_test_name())


    def should_display_buildbot_link(self):
        """Check whether a buildbot link should show for this view.

        For suite job view, show buildbot link if it fails.
        For normal test view,
            show buildbot link if it is a retry
            show buildbot link if it hits its own timeout.
            show buildbot link if it fails. This doesn't
            include the case where it was aborted but has
            not hit its own timeout (most likely it was aborted because
            suite has timed out).

        @returns: True if we should show the buildbot link.
                  False otherwise.
        """
        is_bad_status = self.view['status'] not in ('GOOD', 'TEST_NA')
        if self.get_testname() == self.SUITE_JOB:
            return is_bad_status
        if self.is_retry():
            return True
        if is_bad_status:
            return not self.is_aborted() or self.hit_timeout()
        # A passing, non-retried test needs no link. Return an explicit
        # False instead of falling off the end with None.
        return False


    def get_control_file_attributes(self):
        """Get the attributes from the control file of the test.

        @returns: A list of test attribute or None.
        """
        control_file = self.afe_job.control_file
        attributes = None
        if control_file:
            cd = control_data.parse_control_string(control_file)
            attributes = list(cd.attributes)
        return attributes


    def override_afe_job_id(self, afe_job_id):
        """Overrides the AFE job id for the test.

        @param afe_job_id: The new AFE job id to use.
        """
        self.view['afe_job_id'] = afe_job_id
1156
1157
def log_buildbot_links(log_func, links):
    """Write every buildbot link of the given LogLinks to a log.

    For each link its buildbot links are logged first, followed by the
    retry link and then the history link when those exist.

    @param log_func: Logging function to use.
    @param links: Iterable of LogLink instances.
    """
    optional_generators = ('GenerateRetryLink', 'GenerateHistoryLink')
    for link in links:
        for buildbot_link in link.GenerateBuildbotLinks():
            log_func(buildbot_link)
        for generator_name in optional_generators:
            optional_link = getattr(link, generator_name)()
            if optional_link:
                log_func(optional_link)
1173
1174
class _ReturnCodeComputer(object):
    """Derives the overall _ReturnResult of a suite from its test views."""

    def __call__(self, test_views):
        """Combine the results of all views into one _ReturnResult.

        @param test_views: Iterable of TestView objects.
        @returns: The per-view results combined with the `|` operator,
                  starting from the 'ok' result.
        """
        combined = _RETURN_RESULTS['ok']

        for view in test_views:
            if view.get_testname() == TestView.SUITE_JOB:
                get_result = self._get_suite_result
            else:
                get_result = self._get_test_result
            combined |= get_result(view)
        return combined

    def _get_suite_result(self, test_view):
        """Return the _ReturnResult for the given suite job."""
        # The checks are ordered; the first one that matches wins.
        if test_view.is_aborted() and test_view.hit_timeout():
            key = 'suite_timeout'
        elif test_view.is_in_fail_status():
            key = 'suite_failed'
        elif test_view['status'] == 'WARN':
            key = 'suite_warning'
        else:
            key = 'ok'
        return _RETURN_RESULTS[key]

    def _get_test_result(self, test_view):
        """Return the _ReturnResult for the given test job."""
        # The checks are ordered; the first one that matches wins.
        key = None
        if test_view.is_aborted():
            if test_view.is_relevant_suite_view():
                # Aborted before it ever started, which guarantees
                # that the suite has timed out.
                key = 'test_aborted_prestart'
            elif not test_view.hit_timeout():
                # Aborted, but not because of its own timeout. Most
                # likely the suite timed out, though a user may also have
                # aborted it. Suite timeouts are detected from the suite
                # job view, so this view is simply ignored here.
                key = 'test_aborted_mystery'

        if key is None:
            if test_view.is_in_fail_status():
                # The test job failed.
                if test_view.is_provision():
                    key = 'provision_failed'
                else:
                    key = 'test_failure'
            elif test_view['status'] == 'WARN':
                key = 'test_warning'
            elif test_view.is_retry():
                # The test is a passing retry.
                key = 'test_retry'
            else:
                key = 'ok'
        return _RETURN_RESULTS[key]
1228
1229
class _ProvisionReturnCodeComputer(_ReturnCodeComputer):
    """This is used for returning the _ReturnResult for provision suites.

    A provision suite counts as ok as soon as enough provision jobs
    passed, whatever the remaining jobs did.
    """

    def __init__(self, num_required):
        """Initialize instance.

        @param num_required: the number of passing provision jobs needed.
        """
        super(_ProvisionReturnCodeComputer, self).__init__()
        self._num_required = num_required
        self._num_successful = 0

    def __call__(self, test_views):
        """Compute the result, upgrading it to ok on enough passes.

        @param test_views: Iterable of TestView objects.
        @returns: The 'ok' result if at least num_required views passed,
                  the result computed by the base class otherwise.
        """
        # Count afresh on every invocation. Otherwise successes from an
        # earlier call would accumulate and could wrongly upgrade the
        # result when the computer is called more than once.
        self._num_successful = 0
        result = super(_ProvisionReturnCodeComputer, self).__call__(test_views)
        if self._num_successful >= self._num_required:
            logging.info('Return result upgraded from %r'
                         ' due to enough ok provisions',
                         result)
            return _RETURN_RESULTS['ok']
        else:
            return result

    def _get_test_result(self, test_view):
        """Return the result for one test view, counting the successes.

        A view counts as successful when its result is 'ok' or
        'test_retry' (a passing retry).
        """
        result = (super(_ProvisionReturnCodeComputer, self)
                  ._get_test_result(test_view))
        if result in {_RETURN_RESULTS[s] for s in ('ok', 'test_retry')}:
            self._num_successful += 1
        return result
1258
1259
1260class ResultCollector(object):
1261    """Collect test results of a suite or a single test run.
1262
1263    Once a suite job has finished, use this class to collect test results.
1264    `run` is the core method that is to be called first. Then the caller
1265    could retrieve information like return code, return message, is_aborted,
1266    and timings by accessing the collector's public attributes. And output
1267    the test results and links by calling the 'output_*' methods.
1268
    Here is an overview of what the `run` method does.
1270
1271    1) Collect the suite job's results from tko_test_view_2.
1272    For the suite job, we only pull test views without a 'subdir'.
    A NULL subdir indicates that the test was _not_ executed. This could be
    because no child job was scheduled for this test, or because the child
    job was aborted before it started running.
1276    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
1277
1278    2) Collect the child jobs' results from tko_test_view_2.
1279    For child jobs, we pull all the test views associated with them.
1280    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
1281
1282    3) Generate web and buildbot links.
1283    4) Compute timings of the suite run.
1284    5) Compute the return code based on test results.
1285
1286    @var _instance_server: The hostname of the server that is used
1287                           to service the suite.
1288    @var _afe: The afe rpc client.
1289    @var _tko: The tko rpc client.
1290    @var _build: The build for which the suite is run,
1291                 e.g. 'lumpy-release/R35-5712.0.0'
1292    @var _suite_name: The suite name, e.g. 'bvt', 'dummy'.
1293    @var _suite_job_id: The job id of the suite for which we are going to
1294                        collect results.
1295    @var _original_suite_name: The suite name we record timing would be
1296                               different from _suite_name when running
1297                               suite_attr_wrapper.
1298    @var _return_code_function: Called to return what the overall result of
1299                                the suite is.
1300    @var _suite_views: A list of TestView objects, representing relevant
1301                       test views of the suite job.
1302    @var _child_views: A list of TestView objects, representing test views
1303                       of the child jobs.
1304    @var _test_views: A list of TestView objects, representing all test views
1305                      from _suite_views and _child_views.
1306    @var _web_links: A list of web links pointing to the results of jobs.
1307    @var buildbot_links: A list of buildbot links for non-passing tests.
1308    @var _solo_test_run: True if this is a single test run.
1309    @var return_result: The _ReturnResult of the suite run.
1310    @var is_aborted: Whether the suite was aborted or not.
1311                     True, False or None (aborting status is unknown yet)
    @var timings: A Timings object that records the suite's timings.
1313
1314    """
1315
1316
1317    def __init__(self, instance_server, afe, tko, build,
1318                 suite_name, suite_job_id, return_code_function,
1319                 original_suite_name=None,
1320                 user=None, solo_test_run=False):
1321        self._instance_server = instance_server
1322        self._afe = afe
1323        self._tko = tko
1324        self._build = build
1325        self._suite_name = suite_name
1326        self._suite_job_id = suite_job_id
1327        self._original_suite_name = original_suite_name or suite_name
1328        self._return_code_function = return_code_function
1329        self._suite_views = []
1330        self._child_views = []
1331        self._test_views = []
1332        self._retry_counts = {}
1333        self._missing_results = {}
1334        self._web_links = []
1335        self.buildbot_links = []
1336        self._num_child_jobs = 0
1337        self.return_result = None
1338        self.is_aborted = None
1339        self.timings = None
1340        self._user = user or getpass.getuser()
1341        self._solo_test_run = solo_test_run
1342
1343
1344    def _fetch_relevant_test_views_of_suite(self):
1345        """Fetch relevant test views of the suite job.
1346
1347        For the suite job, there will be a test view for SERVER_JOB, and views
1348        for results of its child jobs. For example, assume we've created
1349        a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail,
1350        dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while
1351        dummy_Path.bluetooth got TEST_NA as no duts have bluetooth.
1352        So the suite job's test views would look like
1353        _____________________________________________________________________
1354        test_idx| job_idx|test_name           |subdir      |afe_job_id|status
1355        10      | 1000   |SERVER_JOB          |----        |40        |GOOD
1356        11      | 1000   |dummy_Pass          |NULL        |40        |ABORT
1357        12      | 1000   |dummy_Fail.Fail     |41-onwer/...|40        |FAIL
1358        13      | 1000   |dummy_Fail.Error    |42-owner/...|40        |ERROR
1359        14      | 1000   |dummy_Pass.bluetooth|NULL        |40        |TEST_NA
1360
1361        For a suite job, we only care about
1362        a) The test view for the suite job's SERVER_JOB
1363        b) The test views for real tests without a subdir. A NULL subdir
1364           indicates that a test didn't get executed.
1365        So, for the above example, we only keep test views whose test_idxs
1366        are 10, 11, 14.
1367
1368        @returns: A list of TestView objects, representing relevant
1369                  test views of the suite job.
1370
1371        """
1372        suite_job = self._afe.get_jobs(id=self._suite_job_id)[0]
1373        views = self._tko.run(call='get_detailed_test_views',
1374                              afe_job_id=self._suite_job_id)
1375        relevant_views = []
1376        for v in views:
1377            v = TestView(v, suite_job, self._suite_name, self._build, self._user,
1378                         solo_test_run=self._solo_test_run)
1379            if v.is_relevant_suite_view():
1380                # If the test doesn't have results in TKO and is being
1381                # displayed in the suite view instead of the child view,
1382                # then afe_job_id is incorrect and from the suite.
1383                # Override it based on the AFE job id which was missing
1384                # results.
1385                # TODO: This is likely inaccurate if a test has multiple
1386                # tries which all fail TKO parse stage.
1387                if v['test_name'] in self._missing_results:
1388                    v.override_afe_job_id(
1389                            self._missing_results[v['test_name']][0])
1390                relevant_views.append(v)
1391        return relevant_views
1392
1393
1394    def _compute_retry_count(self, view):
1395        """Return how many times the test has been retried.
1396
1397        @param view: A TestView instance.
1398        @returns: An int value indicating the retry count.
1399
1400        """
1401        old_job = view['job_keyvals'].get('retry_original_job_id')
1402        count = 0
1403        while old_job:
1404            count += 1
1405            views = self._tko.run(
1406                call='get_detailed_test_views', afe_job_id=old_job)
1407            old_job = (views[0]['job_keyvals'].get('retry_original_job_id')
1408                       if views else None)
1409        return count
1410
1411
1412    def _fetch_test_views_of_child_jobs(self, jobs=None):
1413        """Fetch test views of child jobs.
1414
1415        @returns: A tuple (child_views, retry_counts, missing_results)
1416                  child_views is list of TestView objects, representing
1417                  all valid views.
1418                  retry_counts is a dictionary that maps test_idx to retry
1419                  counts. It only stores retry counts that are greater than 0.
1420                  missing_results is a dictionary that maps test names to
1421                  lists of job ids.
1422
1423        """
1424        child_views = []
1425        retry_counts = {}
1426        missing_results = {}
1427        child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id)
1428        if child_jobs:
1429            self._num_child_jobs = len(child_jobs)
1430        for job in child_jobs:
1431            views = [TestView(v, job, self._suite_name, self._build, self._user)
1432                     for v in self._tko.run(
1433                         call='get_detailed_test_views', afe_job_id=job.id,
1434                         invalid=0)]
1435            if len(views) == 0:
1436                missing_results.setdefault(job.name, []).append(job.id)
1437            contains_test_failure = any(
1438                    v.is_test() and v['status'] != 'GOOD' for v in views)
1439            for v in views:
1440                if (v.is_test() or
1441                        v['status'] != 'GOOD' and not contains_test_failure):
1442                    # For normal test view, just keep it.
1443                    # For SERVER_JOB or CLIENT_JOB, only keep it
1444                    # if it fails and no other test failure.
1445                    child_views.append(v)
1446                    retry_count = self._compute_retry_count(v)
1447                    if retry_count > 0:
1448                        retry_counts[v['test_idx']] = retry_count
1449        return child_views, retry_counts, missing_results
1450
1451
1452    def _generate_web_and_buildbot_links(self):
1453        """Generate web links and buildbot links."""
1454        # TODO(fdeng): If a job was aborted before it reaches Running
1455        # state, we read the test view from the suite job
1456        # and thus this method generates a link pointing to the
1457        # suite job's page for the aborted job. Need a fix.
1458        self._web_links = []
1459        self.buildbot_links = []
1460
1461        # Bug info are stored in the suite job's keyvals.
1462        if self._solo_test_run:
1463            suite_job_keyvals = {}
1464        elif not self._suite_views:
1465            suite_job_keyvals = {}
1466        else:
1467            suite_job_keyvals = self._suite_views[0]['job_keyvals']
1468
1469        for v in self._test_views:
1470            retry_count = self._retry_counts.get(v['test_idx'], 0)
1471            bug_info = v.get_bug_info(suite_job_keyvals)
1472            job_id_owner = v.get_job_id_owner_str()
1473            link = LogLink(
1474                    anchor=v.get_testname(),
1475                    server=self._instance_server,
1476                    job_string=job_id_owner,
1477                    bug_info=bug_info, retry_count=retry_count,
1478                    testname=v.get_testname(),
1479                    sponge_url=suite_job_keyvals.get('sponge_url'))
1480            self._web_links.append(link)
1481
1482            if v.should_display_buildbot_link():
1483                link.reason = v.get_buildbot_link_reason()
1484                self.buildbot_links.append(link)
1485
1486
1487    def _record_timings(self):
1488        """Record suite timings."""
1489        self.timings = Timings(self._suite_job_id)
1490        for v in self._test_views:
1491            self.timings.RecordTiming(v)
1492
1493
1494    def _compute_return_code(self):
1495        """Compute the exit code based on test results."""
1496        self.return_result = self._return_code_function(self._test_views)
1497
1498
1499    def _make_test_results(self):
1500        """Make TestResults for collected tests.
1501
1502        @returns: List of TestResult instances.
1503        """
1504        test_results = []
1505        for test_view in self._test_views:
1506            test_result = TestResult(
1507                test_view=test_view,
1508                retry_count=self._retry_counts.get(test_view['test_idx'], 0))
1509            test_results.append(test_result)
1510        return test_results
1511
1512
1513    def output_results(self):
1514        """Output test results, timings and web links."""
1515        # Output test results
1516        test_results = self._make_test_results()
1517        if len(test_results) == 0:
1518            max_name_length = 0
1519        else:
1520            max_name_length = max(len(t.name) for t in test_results)
1521        for test_result in test_results:
1522            test_result.log_using(logging.info, max_name_length + 3)
1523        # Output suite timings
1524        logging.info(self.timings)
1525        # Output links to test logs
1526        logging.info('\nLinks to test logs:')
1527        for link in self._web_links:
1528            logging.info(link.text_link)
1529        logging.info('\n')
1530
1531
1532    def get_results_dict(self):
1533        """Write test results, timings and web links into a dict.
1534
1535        @returns: A dict of results in the format like:
1536                  {
1537                  'tests': {
1538                        'test_1': {'status': 'PASSED', 'attributes': [1,2], ...}
1539                        'test_2': {'status': 'FAILED', 'attributes': [1],...}
1540                  }
1541                  'suite_timings': {
1542                        'download_start': '1998-07-17 00:00:00',
1543                        'payload_download_end': '1998-07-17 00:00:05',
1544                        ...
1545                  }
1546                  }
1547        """
1548        output_dict = {}
1549        tests_dict = output_dict.setdefault('tests', {})
1550        for v in self._test_views:
1551            test_name = v.get_testname()
1552            test_info = tests_dict.setdefault(test_name, {})
1553            test_info.update({
1554                'status': v['status'],
1555                'attributes': v.get_control_file_attributes() or list(),
1556                'reason': v['reason'],
1557                'retry_count': self._retry_counts.get(v['test_idx'], 0),
1558                })
1559            # For aborted test, the control file will not be parsed and thus
1560            # fail to get the attributes info. Therefore, the subsystems the
1561            # abort test testing will be missing. For this case, we will assume
1562            # the aborted test will test all subsystems, set subsystem:default.
1563            if (test_info['status'] == 'ABORT' and
1564                not any('subsystem:' in a for a in test_info['attributes'])):
1565                test_info['attributes'].append('subsystem:default')
1566
1567        # Write the links to test logs into the |tests_dict| of |output_dict|.
1568        # For test whose status is not 'GOOD', the link is also buildbot_link.
1569        for link in self._web_links:
1570            test_name = link.anchor.strip()
1571            test_info = tests_dict.get(test_name)
1572            if test_info:
1573                test_info['link_to_logs'] = link.url
1574                test_info['sponge_url'] = link.sponge_url
1575                # Write the retry dashboard link into the dict.
1576                if link in self.buildbot_links and link.testname:
1577                    test_info['retry_dashboard_link'] \
1578                        = reporting_utils.link_retry_url(link.testname)
1579                    # Always write the wmatrix link for compatibility.
1580                    test_info['wmatrix_link'] \
1581                        = reporting_utils.link_wmatrix_retry_url(link.testname)
1582                # Write the bug url into the dict.
1583                if link.bug_id:
1584                    test_info['bug_url'] = link.bug_url
1585
1586        # Write the suite timings into |output_dict|
1587        timings = self.timings
1588        if timings is not None:
1589            time_dict = output_dict.setdefault('suite_timings', {})
1590            time_dict.update({
1591                'download_start' : str(timings.download_start_time),
1592                'payload_download_end' : str(timings.payload_end_time),
1593                'suite_start' : str(timings.suite_start_time),
1594                'artifact_download_end' : str(timings.artifact_end_time),
1595                'tests_start' : str(timings.tests_start_time),
1596                'tests_end' : str(timings.tests_end_time),
1597                })
1598
1599        output_dict['suite_job_id'] = self._suite_job_id
1600
1601        return output_dict
1602
1603
1604    def run(self):
1605        """Collect test results.
1606
1607        This method goes through the following steps:
1608            Fetch relevent test views of the suite job.
1609            Fetch test views of child jobs
1610            Check whether the suite was aborted.
1611            Generate links.
1612            Calculate suite timings.
1613            Compute return code based on the test result.
1614
1615        """
1616        if self._solo_test_run:
1617            self._test_views, self._retry_counts, self._missing_results = (
1618                  self._fetch_test_views_of_child_jobs(
1619                          jobs=self._afe.get_jobs(id=self._suite_job_id)))
1620        else:
1621            self._child_views, self._retry_counts, self._missing_results = (
1622                    self._fetch_test_views_of_child_jobs())
1623            self._suite_views = self._fetch_relevant_test_views_of_suite()
1624            self._test_views = self._suite_views + self._child_views
1625        # For hostless job in Starting status, there is no test view associated.
1626        # This can happen when a suite job in Starting status is aborted. When
1627        # the scheduler hits some limit, e.g., max_hostless_jobs_per_drone,
1628        # max_jobs_started_per_cycle, a suite job can stays in Starting status.
1629        if not self._test_views:
1630            self.return_result = _RETURN_RESULTS['test_views_missing']
1631            return
1632        self.is_aborted = any([view['job_keyvals'].get('aborted_by')
1633                               for view in self._suite_views])
1634        self._generate_web_and_buildbot_links()
1635        self._record_timings()
1636        self._compute_return_code()
1637
1638
1639    def gather_timing_stats(self):
1640        """Collect timing related statistics."""
1641        # Record suite runtime in metadata db.
1642        # Some failure modes can leave times unassigned, report sentinel value
1643        # in that case.
1644        runtime_in_secs = -1
1645        if (self.timings.tests_end_time is not None and
1646            self.timings.suite_start_time is not None):
1647            runtime_in_secs = (self.timings.tests_end_time -
1648                    self.timings.suite_start_time).total_seconds()
1649
1650
1651def _make_child_dependencies(options):
1652    """Creates a list of extra dependencies for child jobs.
1653
1654    @param options: Parsed arguments to run_suite.
1655
1656    @returns: A list of label strings if any dependencies should be added. None
1657            otherwise.
1658    """
1659    if not options.model:
1660        return ()
1661    return ('model:%s' % options.model,)
1662
1663
@retry.retry(error.StageControlFileFailure, timeout_min=10)
def create_suite(afe, options):
    """Create a suite with retries.

    With options.oldrpc set, options.suite_args is converted in place from
    its dict form to the string form the old RPC expects. The conversion is
    only applied while suite_args is still a dict, so calling this function
    again with the same options (as a retry does) leaves the already
    converted value untouched.

    @param afe: The afe object to insert the new suite job into.
    @param options: The options to use in creating the suite.

    @return: The afe_job_id of the new suite job.
    """
    logging.info('%s Submitted create_suite_job rpc',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))

    # TODO(crbug.com/763207): This is to support calling old moblab RPC
    # with ToT code.  This does not need to be supported after M62.
    #
    # The isinstance() guard keeps the conversion idempotent: without it a
    # retried call would treat the previously generated string as the dict
    # and either repr() it a second time or fail indexing it with 'tests'.
    if options.oldrpc and isinstance(options.suite_args, dict):
        suite_args = options.suite_args
        if 'tests' in suite_args:
            # This is for test_that_wrapper
            suite_args = ' '.join([':lab:'] + suite_args['tests'])
        else:
            # This is for suite_attr_wrapper
            suite_args = repr(suite_args)
        options.suite_args = suite_args

    return afe.run(
        'create_suite_job',
        name=options.name,
        board=options.board,
        builds=suite_common.make_builds_from_options(options),
        test_source_build=options.test_source_build,
        check_hosts=not options.no_wait,
        pool=options.pool,
        file_bugs=options.file_bugs,
        priority=options.priority,
        suite_args=options.suite_args,
        wait_for_results=not options.no_wait,
        # The delay before the suite starts is added on top of both limits.
        timeout_mins=options.timeout_mins + options.delay_minutes,
        max_runtime_mins=options.max_runtime_mins + options.delay_minutes,
        job_retry=options.retry,
        max_retries=options.max_retries,
        suite_min_duts=options.suite_min_duts,
        offload_failures_only=options.offload_failures_only,
        run_prod_code=options.run_prod_code,
        delay_minutes=options.delay_minutes,
        job_keyvals=options.job_keyvals,
        test_args=options.test_args,
        child_dependencies=options.child_dependencies,
    )
1712
1713
def _run_suite(options):
    """
    Schedule (or look up) the suite job and hand it to the wait/no-wait
    handler; exceptions not caught here propagate to the caller.

    With options.mock_job_id set, the existing job of that id is used
    instead of creating a new suite job.

    @param options: The parsed options.

    @returns: A SuiteResult, either built here (job creation failure,
              --create_and_return) or returned by _handle_job_nowait /
              _handle_job_wait.

    @raises utils.TestLabException: If the job named by options.mock_job_id
                                    cannot be retrieved.

    """
    # If indicate to use the new style suite control file, convert the args
    if options.use_suite_attr:
        options = change_options_for_suite_attr(options)

    utils.setup_logging(logfile=_get_log_name(options))

    if not (options.bypass_labstatus or options.web):
        utils.check_lab_status(options.build)

    afe = _create_afe(options)
    instance_server = afe.server

    rpc_helper = diagnosis_utils.RPCHelper(afe)
    if options.mock_job_id:
        job_id = int(options.mock_job_id)
        finished_jobs = afe.get_jobs(id=job_id, finished=True)
        # A job that has already finished is not handled in real time.
        is_real_time = not finished_jobs
        known_jobs = finished_jobs or afe.get_jobs(id=job_id)
        if not known_jobs:
            raise utils.TestLabException('Failed to retrieve job: %d' % job_id)
        job_created_on = time_utils.date_string_to_epoch_time(
                known_jobs[0].created_on)
    else:
        is_real_time = True
        try:
            rpc_helper.check_dut_availability(options.dependencies,
                                              options.minimum_duts,
                                              options.skip_duts_check)
            job_id = create_suite(afe, options)
            job_created_on = time.time()
        except (error.CrosDynamicSuiteException,
                error.RPCException, proxy.JSONRPCException) as e:
            logging.exception('Error Message: %s', e)
            return run_suite_common.SuiteResult(
                    run_suite_common.RETURN_CODES.INFRA_FAILURE,
                    {'return_message': str(e)})
        except AttributeError as e:
            # An AttributeError here is reported as invalid options.
            logging.exception('Error Message: %s', e)
            return run_suite_common.SuiteResult(
                    run_suite_common.RETURN_CODES.INVALID_OPTIONS)

    job_timer = diagnosis_utils.JobTimer(
            job_created_on, float(options.timeout_mins))
    job_url = reporting_utils.link_job(job_id,
                                       instance_server=instance_server)
    created_time = job_timer.format_time(job_timer.job_created_time)
    logging.info('%s Created suite job: %s', created_time, job_url)
    logging.info(annotations.StepLink(
        text='Link to suite',
        url=job_url))

    if options.create_and_return:
        msg = '--create_and_return was specified, terminating now.'
        logging.info(msg)
        return run_suite_common.SuiteResult(
                run_suite_common.RETURN_CODES.OK,
                {'return_message': msg})

    if not options.no_wait:
        return _handle_job_wait(afe, job_id, options, job_timer, is_real_time)
    return _handle_job_nowait(job_id, options, instance_server)
1791
1792
1793def _get_log_name(options):
1794    """Return local log file's name.
1795
1796    @param options:         Parsed options.
1797
1798    @return log_name, a string file name.
1799    """
1800    if options.require_logfile:
1801        # options.build is verified to exist in verify_options.
1802        # convert build name from containing / to containing only _.
1803        log_name = 'run_suite-%s.log' % options.build.replace('/', '_')
1804        log_dir = os.path.join(common.autotest_dir, 'logs')
1805        if os.path.exists(log_dir):
1806            log_name = os.path.join(log_dir, log_name)
1807
1808        return log_name
1809    else:
1810        return None
1811
1812
def _create_afe(options):
    """Return an afe instance based on options.

    The server is options.web when given, otherwise the instance looked up
    for options.pool.

    @param options          Parsed options.

    @return afe, an AFE instance.
    """
    if options.web:
        instance_server = options.web
    else:
        instance_server = instance_for_pool(options.pool)
    retrying_afe = frontend_wrappers.RetryingAFE(
            server=instance_server,
            timeout_min=options.afe_timeout_mins,
            delay_sec=options.delay_sec)
    logging.info('Autotest instance created: %s', instance_server)
    return retrying_afe
1827
1828
def _handle_job_wait(afe, job_id, options, job_timer, is_real_time):
    """Handle suite job synchronously.

    Polls the AFE until the suite job is finished, collects the results
    with a ResultCollector and logs them (unless options.json_dump is set).

    @param afe              AFE instance.
    @param job_id           Suite job id.
    @param options          Parsed options.
    @param job_timer        JobTimer for suite job.
    @param is_real_time     Whether or not to handle job timeout. When
                            true, timing stats, the suite timeout check and
                            the pool diagnosis are performed as well.

    @return SuiteResult of suite job.
    """
    rpc_helper = diagnosis_utils.RPCHelper(afe)
    instance_server = afe.server

    # Poll every 10 seconds until the AFE reports the job as finished.
    while True:
        if afe.get_jobs(id=job_id, finished=True):
            break
        _poke_buildbot_with_output(afe, job_id, job_timer)
        if job_timer.debug_output_timer.poll():
            logging.info('The suite job has another %s till timeout.',
                         job_timer.timeout_hours - job_timer.elapsed_time())
        time.sleep(10)
    logging.info('%s Suite job is finished.',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))

    # Usually ResultCollector can tell from the test views whether a suite
    # has timed out. Occasionally, however, the tko parser fails to parse
    # the job_finished time from the job's keyval file, hence this second
    # timeout check in run_suite. It is done right after the suite finishes
    # to be as accurate as possible.
    # A minor race remains: the suite may have been aborted for a reason
    # other than a timeout while job_timer, due to the jitter in waiting
    # for results, considers it timed out. run_suite would then exit with
    # SUITE_TIMEOUT where INFRA_FAILURE was due; this should be very rare.
    # Note the timeout will have no sense when using -m option.
    is_suite_timeout = job_timer.is_suite_timeout()

    # The original suite name is needed to record timing.
    original_suite_name = get_original_suite_name(options.name,
                                                  options.suite_args)

    logging.info('%s Start collecting test results and dump them to json.',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    tko = frontend_wrappers.RetryingTKO(
            server=instance_server,
            timeout_min=options.afe_timeout_mins,
            delay_sec=options.delay_sec)

    # TODO(crbug.com/672348): It needs to be possible for provision
    # suite to pass if only a few tests fail.  Otherwise, a single
    # failing test will be reported as failure even if the suite reports
    # success.
    if options.name != _PROVISION_SUITE:
        return_code_function = _ReturnCodeComputer()
    else:
        # TODO(crbug.com/672348): Creating the suite job requires that
        # suite_args contains num_required.
        return_code_function = _ProvisionReturnCodeComputer(
            num_required=options.suite_args['num_required'])

    collector = ResultCollector(
            instance_server=instance_server,
            afe=afe,
            tko=tko,
            build=options.build,
            suite_name=options.name,
            suite_job_id=job_id,
            return_code_function=return_code_function,
            original_suite_name=original_suite_name)
    collector.run()

    # Dump test outputs into json.
    output_dict = collector.get_results_dict()
    output_dict['autotest_instance'] = instance_server
    if not options.json_dump:
        collector.output_results()

    result = collector.return_result
    if is_real_time:
        # Stats are only recorded when the suite is known not to have been
        # aborted (neither by a user nor through the golo rpc); is_aborted
        # being None means the aborting status is unknown yet.
        if collector.is_aborted == False:
            logging.info('%s Gathering timing stats for the suite job.',
                         diagnosis_utils.JobTimer.format_time(datetime.now()))
            collector.gather_timing_stats()

        if collector.is_aborted == True and is_suite_timeout:
            # There are two possible cases when a suite times out.
            # 1. the suite job was aborted due to timing out
            # 2. the suite job succeeded, but some child jobs
            #    were already aborted before the suite job exited.
            # The case 2 was handled by ResultCollector,
            # here we handle case 1.
            result |= _RETURN_RESULTS['suite_timeout']

        logging.info('\n %s Attempting to display pool info: %s',
                     diagnosis_utils.JobTimer.format_time(datetime.now()),
                     options.pool)
        try:
            # Add some jitter to make up for any latency in
            # aborting the suite or checking for results.
            cutoff = job_timer.timeout_hours + timedelta(hours=0.3)
            rpc_helper.diagnose_pool(options.dependencies, cutoff)
        except proxy.JSONRPCException:
            logging.warning('Unable to display pool info.')

    # And output return message.
    if result.message:
        logging.info('Reason: %s', result.message)

    logging.info('\n %s Output below this line is for buildbot consumption:',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    log_buildbot_links(logging.info, collector.buildbot_links)
    return result.suite_result(output_dict)
1935
1936
def _handle_job_nowait(job_id, options, instance_server):
    """Handle suite job asynchronously.

    Logs the created job and the buildbot links generated for it, then
    returns without waiting for the job.

    @param job_id           Suite job id.
    @param options          Parsed options.
    @param instance_server  Autotest instance hostname.

    @return SuiteResult of suite job, carrying the OK return code.
    """
    logging.info('Created suite job: %r', job_id)
    suite_link = LogLink(options.name, instance_server,
                         '%s-%s' % (job_id, getpass.getuser()))
    for buildbot_link in suite_link.GenerateBuildbotLinks():
        logging.info(buildbot_link)

    exit_message = '--no_wait specified; Exiting.'
    logging.info(exit_message)
    return run_suite_common.SuiteResult(
            run_suite_common.RETURN_CODES.OK,
            {'return_message': exit_message})
1955
1956
def _should_run(options):
    """Check whether the suite should be run based on lab/job status checking.

    The suite is skipped when the lab status check raises a
    TestLabException, or when a job for the same build and suite (and
    model, if options.model is set) was created within the last
    _SEARCH_JOB_MAX_DAYS days.

    @param options          Parsed options.

    @return True if the suite should be run, False otherwise.
    """
    try:
        site_utils.check_lab_status(options.test_source_build)
    except site_utils.TestLabException as ex:
        logging.exception('Lab is closed or build is blocked. Skipping '
                          'suite %s, board %s, build %s:  %s',
                          options.name, options.board,
                          options.test_source_build, str(ex))
        return False

    earliest_created_on = datetime.now() - timedelta(days=_SEARCH_JOB_MAX_DAYS)
    afe = _create_afe(options)
    duplicates = afe.get_jobs(
            name__istartswith=options.test_source_build,
            name__iendswith='control.'+options.name,
            created_on__gte=str(earliest_created_on),
            min_rpc_timeout=_MIN_RPC_TIMEOUT)
    if options.model:
        # Only jobs whose control file mentions the model tag count.
        model_tag = 'model:%s' % options.model
        duplicates = [job for job in duplicates
                      if model_tag in job.control_file]

    if not duplicates:
        return True

    logging.info('Found duplicate suite %s scheduled in past.',
                 duplicates)
    return False
1991
1992
def _poke_buildbot_with_output(afe, job_id, job_timer):
    """Poke buildbot so it doesn't timeout from silence.

    The job is diagnosed only when job_timer.first_past_halftime() is true.

    @param afe              AFE instance.
    @param job_id           Suite job id.
    @param job_timer        JobTimer for suite job.
    """
    helper = diagnosis_utils.RPCHelper(afe)
    if not job_timer.first_past_halftime():
        return
    # Note that this call logs output, preventing buildbot's
    # 9000 second silent timeout from kicking in. Let there be no
    # doubt, this is a hack. The timeout is from upstream buildbot and
    # this is the easiest work around.
    helper.diagnose_job(job_id, afe.server)
2007
2008
2009
def _run_task(options):
    """Perform this script's function minus setup.

    Boilerplate like argument parsing, logging, output formatting happen
    elsewhere.

    DUTsNotAvailableError is turned into a BOARD_NOT_AVAILABLE result and
    TestLabException into an INFRA_FAILURE result; any other exception
    propagates.

    @param options: The parsed options.

    @returns: A SuiteResult instance.

    TODO(ayatane): The try/except should be moved into _run_suite().
    Good luck trying to figure out which function calls are supposed to
    raise which of the exceptions.
    """
    try:
        return _run_suite(options)
    except diagnosis_utils.DUTsNotAvailableError as e:
        # Format the exception itself rather than reading e.message: the
        # attribute is deprecated since Python 2.6 and gone in Python 3,
        # and this matches the TestLabException handler below.
        result = run_suite_common.SuiteResult(
            run_suite_common.RETURN_CODES.BOARD_NOT_AVAILABLE,
            {'return_message': 'Skipping testing: %s' % e})
        logging.info(result.output_dict['return_message'])
        return result
    except utils.TestLabException as e:
        result = run_suite_common.SuiteResult(
            run_suite_common.RETURN_CODES.INFRA_FAILURE,
            {'return_message': 'TestLabException: %s' % e})
        logging.exception(result.output_dict['return_message'])
        return result
2036
2037
class _ExceptionHandler(object):
    """Global exception handler replacement.

    An instance is a callable with the signature of an exception hook: it
    reports the exception and exits the process with the INFRA_FAILURE
    return code.
    """

    def __init__(self, dump_json):
        """Initialize instance.

        @param dump_json: Whether to print a JSON dump of the result dict to
                          stdout.
        """
        self._should_dump_json = dump_json

    def __call__(self, exc_type, value, traceback):
        """Report an unhandled exception and exit with INFRA_FAILURE.

        @param exc_type: Class of the unhandled exception.
        @param value: The exception instance.
        @param traceback: Traceback object of the exception.
        """
        # Let the interpreter's default hook print the traceback to stderr
        # first; otherwise the failure would leave no trace of where it
        # came from before the process exits.
        sys.__excepthook__(exc_type, value, traceback)
        if self._should_dump_json:
            run_suite_common.dump_json(
                    {'return_message': ('Unhandled run_suite exception: %s'
                                        % value)})
        sys.exit(run_suite_common.RETURN_CODES.INFRA_FAILURE)
2055
2056
def _check_if_use_skylab(options):
    """Detect whether to run suite in skylab.

    The migration config is copied from _TRAMPOLINE_CONFIG into a temporary
    directory and consulted for the board, model, suite name and pool of
    options.

    @param options: The parsed options.

    @returns: True if the suite should run in skylab. False if it should
              run in autotest or the trampoline is not enabled.
    """
    if not _ENABLE_RUN_SUITE_TRAMPOLINE:
        logging.info('trampoline to skylab is not enabled.')
        return False

    task_info = 'suite:%s, board:%s, model:%s, pool:%s' % (
            options.name, options.board, options.model, options.pool)
    ctx = gs.GSContext()
    with osutils.TempDir(prefix='trampoline_') as tempdir:
        temp_file = os.path.join(tempdir, _MIGRATION_CONFIG_FILE)
        ctx.Copy(_TRAMPOLINE_CONFIG, temp_file)
        reader = config_reader.ConfigReader(temp_file)
        _migration_config = config_reader.MigrationConfig(reader)

        logging.info('Checking whether to run in skylab: Task(%s)', task_info)
        use_skylab = skylab.should_run_in_skylab(
                _migration_config,
                options.board,
                options.model,
                options.name,
                options.pool)
        if use_skylab:
            logging.info('Task (%s) Should run in skylab', task_info)
            return True

    logging.info('Task (%s) Should run in autotest', task_info)
    return False
2083
2084
def _run_with_skylab(options):
    """Run suite inside skylab.

    Not implemented yet: options is ignored and the 'ok' entry of
    _RETURN_RESULTS is returned.

    NOTE(review): _run_with_autotest returns a SuiteResult while this
    returns a _RETURN_RESULTS entry - confirm both provide what main()
    reads from the result.

    @param options: The parsed options (currently unused).
    """
    # TODO(xixuan): Implement running suite in skylab.
    ok_result = _RETURN_RESULTS['ok']
    return ok_result
2089
2090
def _run_with_autotest(options):
    """Run suite inside autotest.

    With options.pre_check set, the suite is only run when _should_run()
    agrees; otherwise an ERROR result is produced. The output dict of the
    result is dumped as JSON when options.json_dump is set.

    @param options: The parsed options.

    @returns: A SuiteResult instance.
    """
    skip_suite = options.pre_check and not _should_run(options)
    if not skip_suite:
        result = _run_task(options)
    else:
        logging.info('Suite %s-%s is terminated: Lab is closed, OR build is '
                     'blocked, OR this suite has already been kicked off '
                     'once in past %d days.',
                     options.test_source_build, options.name,
                     _SEARCH_JOB_MAX_DAYS)
        return_message = ("Lab is closed OR other reason"
                          " (see code, it's complicated)")
        result = run_suite_common.SuiteResult(
            run_suite_common.RETURN_CODES.ERROR,
            {'return_message': return_message})

    if options.json_dump:
        run_suite_common.dump_json(result.output_dict)

    return result
2110
2111
def main():
    """Entry point.

    Parses the command line, installs the global exception handler,
    validates the options and runs the suite in skylab or in autotest.

    @returns: The return code of the run; 0 when options.do_nothing is set.
    """
    utils.verify_not_root_user()

    parser = make_parser()
    options = parser.parse_args()
    if options.do_nothing:
        return 0

    # sys.excepthook is the hook the interpreter calls for uncaught
    # exceptions; assigning the handler to any other attribute of sys
    # leaves it uninstalled.
    sys.excepthook = _ExceptionHandler(dump_json=options.json_dump)
    if options.json_dump:
        # Silence all logging when a JSON dump is requested.
        logging.disable(logging.CRITICAL)

    options_okay = verify_and_clean_options(options)
    # Set StreamHandler first to capture error messages if suite is not run.
    utils.setup_logging()
    if not options_okay:
        parser.print_help()
        result = run_suite_common.SuiteResult(
                run_suite_common.RETURN_CODES.INVALID_OPTIONS)
    else:
        if _check_if_use_skylab(options):
            result = _run_with_skylab(options)
        else:
            result = _run_with_autotest(options)

    logging.info('Will return from run_suite with status: %s',
                  run_suite_common.RETURN_CODES.get_string(result.return_code))
    return result.return_code
2141
2142
# Run main() only when this file is executed as a script; the value it
# returns is passed to sys.exit().
if __name__ == "__main__":
    sys.exit(main())
2145