• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7
8"""Tool for running suites of tests and waiting for completion.
9
10The desired test suite will be scheduled with autotest. By default,
11this tool will block until the job is complete, printing a summary
12at the end.  Error conditions result in exceptions.
13
This is intended for use only with Chrome OS test suites that leverage the
15dynamic suite infrastructure in server/cros/dynamic_suite.py.
16
17This script exits with one of the following codes:
180 - OK: Suite finished successfully
191 - ERROR: Test(s) failed, or hits its own timeout
202 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out.
213 - INFRA_FAILURE: Infrastructure related issues, e.g.
22    * Lab is down
23    * Too many duts (defined as a constant) in repair failed status
24    * Suite job issues, like bug in dynamic suite,
25      user aborted the suite, lose a drone/all devservers/rpc server,
26      0 tests ran, etc.
27    * provision failed
28      TODO(fdeng): crbug.com/413918, reexamine treating all provision
29                   failures as INFRA failures.
304 - SUITE_TIMEOUT: Suite timed out, some tests ran,
31    none failed by the time the suite job was aborted. This will cover,
32    but not limited to, the following cases:
33    * A devserver failure that manifests as a timeout
34    * No DUTs available midway through a suite
35    * Provision/Reset/Cleanup took longer time than expected for new image
36    * A regression in scheduler tick time.
5 - BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool.
6 - INVALID_OPTIONS: If options are not valid.
39"""
40
41import argparse
42import ast
43import collections
44from collections import namedtuple
45from datetime import datetime
46from datetime import timedelta
47import functools
48import getpass
49import json
50import logging
51import os
52import re
53import sys
54import time
55import warnings
56
57import common
58from chromite.lib import buildbot_annotations as annotations
59
60from autotest_lib.client.common_lib import control_data
61from autotest_lib.client.common_lib import error
62from autotest_lib.client.common_lib import global_config, enum
63from autotest_lib.client.common_lib import priorities
64from autotest_lib.client.common_lib import time_utils
65from autotest_lib.client.common_lib.cros import retry
66from autotest_lib.frontend.afe import rpc_client_lib
67from autotest_lib.frontend.afe.json_rpc import proxy
68from autotest_lib.server import site_utils
69from autotest_lib.server import utils
70from autotest_lib.server.cros import provision
71from autotest_lib.server.cros.dynamic_suite import constants
72from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
73from autotest_lib.server.cros.dynamic_suite import reporting_utils
74from autotest_lib.server.cros.dynamic_suite import tools
75from autotest_lib.site_utils import diagnosis_utils
76from autotest_lib.site_utils import job_overhead
77
# Shared handle on the global autotest configuration.
CONFIG = global_config.global_config

# 'hostname' value from the SERVER section of the global config.
_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value(
        'SERVER', 'hostname', type=str)
# 'log_url_pattern' value from the CROS section of the global config.  It is
# used as a %-format pattern taking (server url, job string).
_URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

# Return code that will be sent back to autotest_rpc_server.py
# NOTE(review): presumably numbered from 0 in declaration order, matching the
# exit codes listed in the module docstring -- confirm against enum.Enum.
RETURN_CODES = enum.Enum(
        'OK', 'ERROR', 'WARNING', 'INFRA_FAILURE', 'SUITE_TIMEOUT',
        'BOARD_NOT_AVAILABLE', 'INVALID_OPTIONS')

# Minimum RPC timeout setting for calls expected to take long time, e.g.,
# create_suite_job. If default socket time (socket.getdefaulttimeout()) is
# None or greater than this value, the default will be used.
# The value here is set to be the same as the timeout for the RetryingAFE object
# so long running RPCs can wait long enough before being aborted.
_MIN_RPC_TIMEOUT = 600

# Number of days back to search for existing job.
_SEARCH_JOB_MAX_DAYS = 14

# Name of the provision suite.
_PROVISION_SUITE = 'provision'
100
101
102@functools.total_ordering
103class _ReturnResult(object):
104    """Represents overall result of run_suite operation.
105
106    _ReturnResult instances sort based on priority (the order in
107    _RETURN_RESULTS).
108
109    Furthermore, _ReturnResult instances can be combined by bitwise or
110    ("union"), which returns the instance with the higher priority
111    between the two (the instance with higher priority is a "superset"
112    of the other).
113
114    Do not create new instances of this; use _RETURN_RESULTS instead.
115    """
116
117    def __init__(self, return_code, message):
118        self.return_code = return_code
119        self.message = message
120
121    def __repr__(self):
122        return '<{cls} {key}, {this.return_code}, {this.message}>'.format(
123            cls=type(self).__name__,
124            key=self._getkey(),
125            this=self)
126
127    def __gt__(self, other):
128        if isinstance(other, type(self)):
129            return self._getkey() > other._getkey()
130        else:
131            return NotImplemented
132
133    def __eq__(self, other):
134        if isinstance(other, type(self)):
135            return (self.return_code == other.return_code
136                    and self.message == other.message)
137        else:
138            return NotImplemented
139
140    def __hash__(self):
141        return hash(self.return_code) ^ hash(self.message)
142
143    def __or__(self, other):
144        if isinstance(other, type(self)):
145            if self > other:
146                return self
147            else:
148                return other
149        else:
150            return NotImplemented
151
152    def _getkey(self):
153        """Return sort key."""
154        return _RETURN_RESULTS_LIST.index(self)
155
156    def suite_result(self, output_dict=None):
157        """Make a SuiteResult using this _ReturnResult.
158
159        @param output_dict: output_dict to merge into SuiteResult.
160        """
161        if output_dict is None:
162            output_dict = dict()
163        else:
164            output_dict = output_dict.copy()
165        if self.message:
166            output_dict['return_message'] = self.message
167        return SuiteResult(self.return_code, output_dict)
168
169
# All possible overall results.  The insertion order matters: a result's
# position in _RETURN_RESULTS_LIST is its sort key, so later entries outrank
# earlier ones when two results are compared or combined with `|`.
_RETURN_RESULTS = collections.OrderedDict([
    ('ok', _ReturnResult(RETURN_CODES.OK, '')),

    ('test_warning', _ReturnResult(
        RETURN_CODES.WARNING, 'Test job raised warning.')),
    ('suite_warning', _ReturnResult(
        RETURN_CODES.WARNING, 'Suite job raised warning.')),
    ('test_retry', _ReturnResult(
        RETURN_CODES.WARNING, 'Tests were retried.')),

    ('test_aborted_prestart', _ReturnResult(
        RETURN_CODES.SUITE_TIMEOUT,
        'Tests were aborted before running; suite must have timed out.')),
    # This really indicates a user action or an infra failure. But, suite
    # timeouts cause similar failures in the individual tests, so we must
    # classify these lower than suite_timeout. In case of a suite_timeout, the
    # result from the suite job will promote the result to suite_timeout.
    ('test_aborted_mystery',
     _ReturnResult(
             RETURN_CODES.SUITE_TIMEOUT,
             'Tests were aborted after running, but before timeout; '
             'Test was manually aborted or parsing results failed: '
             'crbug.com/796348.')),
    ('suite_timeout', _ReturnResult(
        RETURN_CODES.SUITE_TIMEOUT, 'Suite job timed out.')),

    ('test_views_missing', _ReturnResult(
        RETURN_CODES.INFRA_FAILURE, 'No test views found.')),
    ('suite_failed', _ReturnResult(
        RETURN_CODES.INFRA_FAILURE, 'Suite job failed.')),
    ('provision_failed', _ReturnResult(
        RETURN_CODES.INFRA_FAILURE, 'Provisioning failed.')),

    ('test_failure', _ReturnResult(
        RETURN_CODES.ERROR, 'Tests failed.')),
])
# Priority-ordered list of the results above; indexed by _ReturnResult._getkey.
_RETURN_RESULTS_LIST = list(_RETURN_RESULTS.values())
207
208
def bool_str(x):
    """Parse the literal strings 'True' / 'False' into a bool.

    Meant to be used as an argparse `type=` callable.

    @param x: string representation of boolean value.

    @returns: True for 'True', False for 'False'.
    @raises argparse.ArgumentTypeError: for any other value.

    """
    for literal, parsed in (('True', True), ('False', False)):
        if x == literal:
            return parsed
    raise argparse.ArgumentTypeError(
        '%s is not one of True or False' % (x,))
222
223
224def _get_priority_value(x):
225    """Convert a priority representation to its int value.
226
227    Priorities can be described either by an int value (possibly as a string)
228    or a name string.  This function coerces both forms to an int value.
229
230    This function is intended for casting command line arguments during
231    parsing.
232
233    @param x: priority value as an int, int string, or name string
234
235    @returns: int value of priority
236    """
237    try:
238        return int(x)
239    except ValueError:
240        try:
241            return priorities.Priority.get_value(x)
242        except AttributeError:
243            raise argparse.ArgumentTypeError(
244                'Unknown priority level %s.  Try one of %s.'
245                % (x, ', '.join(priorities.Priority.names)))
246
247
def make_parser():
    """Make ArgumentParser instance for run_suite.py.

    Several boolean options take an explicit "True"/"False" value instead of
    being plain flags; see the comments next to them.

    @returns: A configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s [options]")
    parser.add_argument("-b", "--board", dest="board")
    parser.add_argument(
            "--model",
            help="The device model to run tests against. For non-unified "
                 "builds, model and board are synonymous, but board is more "
                 "accurate in some cases. Only pass this option if your build "
                 "is a unified build.",
    )
    parser.add_argument("-i", "--build", dest="build")
    parser.add_argument(
        "-w", "--web", dest="web", default=None,
        help="Address of a webserver to receive suite requests.")
    parser.add_argument(
        '--cheets_build', dest='cheets_build', default=None,
        help='ChromeOS Android build to be installed on dut.')
    parser.add_argument(
        '--firmware_rw_build', dest='firmware_rw_build', default=None,
        help='Firmware build to be installed in dut RW firmware.')
    parser.add_argument(
        '--firmware_ro_build', dest='firmware_ro_build', default=None,
        help='Firmware build to be installed in dut RO firmware.')
    parser.add_argument(
        '--test_source_build', dest='test_source_build', default=None,
        help=('Build that contains the test code, '
              'e.g., it can be the value of `--build`, '
              '`--firmware_rw_build` or `--firmware_ro_build` '
              'arguments. Default is None, that is, use the test '
              'code from `--build` (CrOS image)'))
    #  This should just be a boolean flag, but the autotest "proxy" code
    #  can't handle flags that don't take arguments.
    parser.add_argument(
        "-n", "--no_wait", dest="no_wait", default=False, type=bool_str,
        help='Must pass "True" or "False" if used.')
    # If you really want no pool, --pool="" will do it. USE WITH CARE.
    parser.add_argument("-p", "--pool", dest="pool", default="suites")
    parser.add_argument("-s", "--suite_name", dest="name")
    parser.add_argument("-a", "--afe_timeout_mins", type=int,
                        dest="afe_timeout_mins", default=30)
    parser.add_argument("-t", "--timeout_mins", type=int,
                        dest="timeout_mins", default=1440)
    parser.add_argument("-x", "--max_runtime_mins", type=int,
                        dest="max_runtime_mins", default=1440)
    parser.add_argument("-d", "--delay_sec", type=int,
                        dest="delay_sec", default=10)
    parser.add_argument("-m", "--mock_job_id", dest="mock_job_id",
                        help="Attach to existing job id for already running "
                        "suite, and creates report.")
    # NOTE(akeshet): This looks similar to --no_wait, but behaves differently.
    # --no_wait is passed in to the suite rpc itself and affects the suite,
    # while this does not.
    parser.add_argument("-c", "--create_and_return", dest="create_and_return",
                        action="store_true",
                        help="Create the suite and print the job id, then "
                        "finish immediately.")
    parser.add_argument("-u", "--num", dest="num", type=int, default=None,
                        help="Deprecated, does nothing.")
    #  Same boolean flag issue applies here.
    parser.add_argument(
        "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str,
        help=('File bugs on test failures. Must pass "True" or '
              '"False" if used.'))
    parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus",
                        action="store_true", help='Bypass lab status check.')
    # We allow either a number or a string for the priority.  This way, if you
    # know what you're doing, one can specify a custom priority level between
    # other levels.
    parser.add_argument("-r", "--priority", dest="priority",
                        type=_get_priority_value,
                        default=priorities.Priority.DEFAULT,
                        action="store",
                        help="Priority of suite. Either numerical value, or "
                        "one of (" + ", ".join(priorities.Priority.names)
                        + ").")
    parser.add_argument(
        '--retry', dest='retry', default=False, type=bool_str, action='store',
        help='Enable test retry.  Must pass "True" or "False" if used.')
    # The two help literals are concatenated, so the first one must end with
    # a space to keep the words apart.
    parser.add_argument('--max_retries', dest='max_retries', default=None,
                        type=int, action='store', help='Maximum retries '
                        'allowed at suite level. No limit if not specified.')
    parser.add_argument('--minimum_duts', dest='minimum_duts', type=int,
                        default=0, action='store',
                        help='Check that the pool has at least such many '
                        'healthy machines, otherwise suite will not run. '
                        'Default to 0.')
    parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int,
                        default=0, action='store',
                        help='Preferred minimum number of machines. Scheduler '
                        'will prioritize on getting such many machines for '
                        'the suite when it is competing with another suite '
                        'that has a higher priority but already got minimum '
                        'machines it needs. Default to 0.')
    # ast.literal_eval only accepts Python literals, so the value cannot
    # execute arbitrary code.
    parser.add_argument("--suite_args", dest="suite_args",
                        type=ast.literal_eval,
                        default=None, action="store",
                        help="A dict of args passed to the suite control file.")
    parser.add_argument('--offload_failures_only',
                        dest='offload_failures_only', type=bool_str,
                        action='store', default=False,
                        help='Only enable gs_offloading for failed tests. '
                        'Successful tests will be deleted. Must pass "True"'
                        ' or "False" if used.')
    parser.add_argument('--use_suite_attr', dest='use_suite_attr',
                        action='store_true', default=False,
                        help='Advanced. Run the suite based on ATTRIBUTES of '
                        'control files, rather than SUITE.')
    parser.add_argument('--json_dump', dest='json_dump', action='store_true',
                        default=False,
                        help='Dump the output of run_suite to stdout.')
    parser.add_argument(
        '--run_prod_code', dest='run_prod_code',
        action='store_true', default=False,
        help='Run the test code that lives in prod aka the test '
        'code currently on the lab servers.')
    parser.add_argument(
        '--delay_minutes', type=int, default=0,
        help=('Delay the creation of test jobs for a given '
              'number of minutes. This argument can be used to '
              'force provision jobs being delayed, which helps '
              'to distribute loads across devservers.'))
    parser.add_argument(
        '--skip_duts_check', dest='skip_duts_check', action='store_true',
        default=False, help='If True, skip minimum available DUTs check')
    parser.add_argument(
        '--job_keyvals', dest='job_keyvals', type=ast.literal_eval,
        action='store', default=None,
        help='A dict of job keyvals to be inject to suite control file')
    parser.add_argument(
        '--test_args', dest='test_args', type=ast.literal_eval,
        action='store', default=None,
        help=('A dict of args passed all the way to each individual test that '
              'will be actually ran.'))
    parser.add_argument(
        '--require_logfile', action='store_true',
        help=('Stream logs of run_suite.py to a local file named '
              'run_suite-<build name>.log.'))

    # Used for monitoring purposes, to measure no-op swarming proxy latency.
    parser.add_argument('--do_nothing', action='store_true',
                        help=argparse.SUPPRESS)

    # Used when lab/job status checking is needed. Currently its only user is
    # suite scheduler v2.
    parser.add_argument(
        '--pre_check', action='store_true',
        help=('Check lab and job status before kicking off a suite. Used by '
              'suite scheduler v2.'))

    # TODO(crbug.com/763207): This is to support calling old moblab RPC
    # with ToT code.  This does not need to be supported after M62.
    parser.add_argument('--oldrpc', action='store_true',
                        help='Use old AFE RPC.')

    return parser
405
406
def verify_and_clean_options(options):
    """Verify the validity of options and normalize them in place.

    Besides validating, this removes the deprecated `num` attribute from
    options and defaults `test_source_build` to `build`.  Problems are
    reported on stdout.

    @param options: The parsed options to verify.

    @returns: True if verification passes, False otherwise.

    """
    if options.mock_job_id and (
            not options.build or not options.name or not options.board):
        print('When using -m, need to specify build, board and suite '
              'name which you have used for creating the original job')
        return False

    # Build, board and suite name are always required; they are checked in
    # this order so the first missing one is the one reported.
    required = (
        (options.build, 'Need to specify which build to use'),
        (options.board, 'Need to specify board'),
        (options.name, 'Need to specify suite name'),
    )
    for value, complaint in required:
        if not value:
            print(complaint)
            return False

    if options.num is not None:
        warnings.warn('-u/--num option is deprecated; it does nothing.')
    del options.num
    if not options.retry and options.max_retries is not None:
        print('max_retries can only be used with --retry=True')
        return False
    if options.use_suite_attr and options.suite_args is not None:
        print('The new suite control file cannot parse the suite_args: %s. '
              'Please do not specify any suite_args here.'
              % options.suite_args)
        return False
    if options.no_wait and options.retry:
        # NOTE(review): this is only reported; the options are still
        # accepted.  Confirm whether it should reject them instead.
        print('Test retry is not available when using --no_wait=True')
    # Default to use the test code in CrOS build.
    if not options.test_source_build and options.build:
        options.test_source_build = options.build
    return True
446
447
def change_options_for_suite_attr(options):
    """Change options to be prepared to run the suite_attr_wrapper.

    If specify 'use_suite_attr' from the cmd line, it indicates to run the
    new style suite control file, suite_attr_wrapper. Then, change the
    options.name to 'suite_attr_wrapper', change the options.suite_args to
    include the arguments needed by suite_attr_wrapper.

    Any previous value of options.suite_args is replaced.

    @param options: The verified options.  options.name is either a single
                    suite name or an iterable of suite names.

    @returns: The changed options (the same object that was passed in).

    """
    # Treat every kind of string as a single suite name.  An exact
    # `type(...) is str` check would miss unicode and str subclasses, which
    # would then be iterated character by character.
    try:
        string_types = basestring  # Python 2: covers str and unicode.
    except NameError:
        string_types = str

    # Convert the suite_name to attribute boolean expression.
    if isinstance(options.name, string_types):
        attr_filter_val = 'suite:%s' % options.name
    else:
        attr_filter_val = ' or '.join(['suite:%s' % x for x in options.name])

    # suite_attr_wrapper only needs the attribute filter.
    options.suite_args = {'attr_filter': attr_filter_val}
    options.name = 'suite_attr_wrapper'

    return options
475
476
class TestResult(object):
    """Outcome of a single TestView, in a form that is easy to log."""

    # Statuses that have their own label; every other status is shown as
    # failed.
    _PRETTY_STATUS_MAP = {
        'GOOD':    '[ PASSED ]',
        'TEST_NA': '[  INFO  ]',
    }

    def __init__(self, test_view, retry_count=0):
        """Capture name, status and reason from a test view.

        @param test_view: TestView instance.
        @param retry_count: Retry count for test.  Optional.
        """
        self.name = test_view.get_testname()
        self.status = test_view['status']
        self.reason = test_view['reason']
        self.retry_count = retry_count

    @property
    def _pretty_status(self):
        """Fixed-width label for the status."""
        try:
            return self._PRETTY_STATUS_MAP[self.status]
        except KeyError:
            return '[ FAILED ]'

    def log_using(self, log_function, name_column_width):
        """Write this result through log_function.

        One line is always written; a status/reason line follows for any
        status other than GOOD, and a retry line when the test was retried.

        @param log_function: Log function to use.  Example: logging.info
        @param name_column_width: Width of name column for formatting.
        """
        name_cell = self.name.ljust(name_column_width)

        def emit(fmt, *values):
            """Log one line that starts with the padded test name."""
            log_function(fmt, name_cell, *values)

        emit('%s%s', self._pretty_status)
        if self.status != 'GOOD':
            emit('%s  %s: %s', self.status, self.reason)
        if self.retry_count > 0:
            emit('%s  retry_count: %s', self.retry_count)
514
515
def get_original_suite_name(suite_name, suite_args):
    """Get the original suite name when running suite_attr_wrapper.

    @param suite_name: the name of the suite launched in afe. When it is
                       suite_attr_wrapper, the suite that actually running is
                       specified in the suite_args.
    @param suite_args: dict of suite args from argument parsing, or None.

    @returns: the original suite name.  For suite_attr_wrapper this is the
              first 'suite:<name>' entry of the 'attr_filter' suite arg;
              if there is none, suite_name is returned unchanged.

    """
    if suite_name != 'suite_attr_wrapper':
        return suite_name

    # suite_args is None when no suite args were given; treat it as empty
    # rather than failing on the lookup.
    attrs = (suite_args or {}).get('attr_filter', '')
    prefix = 'suite:'
    # Parentheses and spaces separate the terms of the attribute expression.
    suite_list = [term[len(prefix):] for term in re.split('[() ]', attrs)
                  if term.startswith(prefix)]
    return suite_list[0] if suite_list else suite_name
533
534
class LogLink(object):
    """A link that gets recorded in the logs for one job.

    The same object can render a link to the job's log files and, when a
    bug was filed for a failure in the job, a link to that bug.

    @var anchor  The link text.
    @var url     The link url.
    @var bug_id  Id of a bug to link to, or None.
    """

    # A list of tests that don't get retried so skip the dashboard.
    _SKIP_RETRY_DASHBOARD = ['provision']

    _BUG_LINK_PREFIX = 'Auto-Bug'
    _LOG_LINK_PREFIX = 'Test-Logs'


    def __init__(self, anchor, server, job_string, bug_info=None, reason=None,
                 retry_count=0, testname=None, sponge_url=None):
        """Build the log URL and remember the details shown next to it.

        @param anchor      The link text.
        @param server      The hostname of the server this suite ran on.
        @param job_string  The job whose logs we'd like to link to.
        @param bug_info    Info about the bug, if one was filed: a
                           (bug id, bug count) pair.
        @param reason      A string representing the reason of failure if any.
        @param retry_count How many times the test has been retried.
        @param testname    Optional Arg that supplies the testname.
        @param sponge_url  url to Sponge result.
        """
        self.anchor = anchor
        server_url = rpc_client_lib.add_protocol(server)
        self.url = _URL_PATTERN % (server_url, job_string)
        self.reason = reason
        self.retry_count = retry_count
        self.testname = testname
        self.sponge_url = sponge_url
        # Without bug info both bug fields stay None.
        self.bug_id, self.bug_count = bug_info if bug_info else (None, None)


    @property
    def bug_url(self):
        """URL of associated bug, or None when there is no bug."""
        return reporting_utils.link_crbug(self.bug_id) if self.bug_id else None


    @property
    def _bug_count_text(self):
        """Return bug count as human friendly text."""
        count = self.bug_count
        if count is None:
            return 'unknown number of reports'
        if count == 1:
            return 'new report'
        return '%s reports' % count


    def GenerateBuildbotLinks(self):
        """Generate links formatted to meet buildbot expectations.

        The link to the job logs is always produced; when a bug is
        associated with this link, a link to the bug comes first.

        @return A generator of links formatted for the buildbot log annotator.
        """
        if self.bug_url:
            yield self._get_link_to_bug()
        yield self._get_link_to_job_logs()


    def _get_link_to_bug(self):
        """Return buildbot link to bug.

        @return A link formatted for the buildbot log annotator.
        """
        details = self._get_info_strings() + [self._bug_count_text]
        label = self._format_anchor_text(self._BUG_LINK_PREFIX, details)
        return annotations.StepLink(label, self.bug_url)


    def _get_link_to_job_logs(self):
        """Return buildbot link to job logs.

        @return A link formatted for the buildbot log annotator.
        """
        label = self._format_anchor_text(self._LOG_LINK_PREFIX,
                                         self._get_info_strings())
        return annotations.StepLink(label, self.url)


    def _get_info_strings(self):
        """Return a list of info strings for _format_anchor_text().

        The retry count comes first (only when positive), then the failure
        reason (only when set).
        """
        details = []
        if self.retry_count > 0:
            details.append('retry_count: %d' % self.retry_count)
        if self.reason:
            details.append(self.reason)
        return details


    def _format_anchor_text(self, prefix, info_strings):
        """Format anchor text given a prefix and info strings.

        @param prefix        The prefix of the anchor text.
        @param info_strings  Iterable of strings.
        @return A anchor_text with the right prefix and info strings.
        """
        joined_info = ', '.join(info_strings)
        stripped_anchor = self.anchor.strip()
        return '[{prefix}]: {anchor}: {info}'.format(
            prefix=prefix, anchor=stripped_anchor, info=joined_info)

    @property
    def text_link(self):
        """Link to the job's logs, for consumption by a human.

        @return A link formatted for human readability.
        """
        return '%s %s' % (self.anchor, self.url)

    def _has_no_dashboard(self):
        """Tell whether this link's test has no dashboard pages.

        True when no test name is known or the test is listed in
        _SKIP_RETRY_DASHBOARD.
        """
        return not self.testname or self.testname in self._SKIP_RETRY_DASHBOARD

    def GenerateRetryLink(self):
        """Generate a link to the retry dashboard.

        @return A link formatted for the buildbot log annotator, or None
                when the test has no dashboard.
        """
        if self._has_no_dashboard():
            return None
        return annotations.StepLink(
            text='[Flake-Dashboard]: %s' % self.testname,
            url=reporting_utils.link_retry_url(self.testname))

    def GenerateHistoryLink(self):
        """Generate a link to the test history dashboard.

        @return A link formatted for the buildbot log annotator, or None
                when the test has no dashboard.
        """
        if self._has_no_dashboard():
            return None
        return annotations.StepLink(
            text='[Test-History]: %s' % self.testname,
            url=reporting_utils.link_test_history(self.testname))
688
689
690class Timings(object):
691    """Timings for important events during a suite.
692
693    All timestamps are datetime.datetime objects.
694
695    @var suite_job_id: the afe job id of the suite job for which
696                       we are recording the timing for.
697    @var download_start_time: the time the devserver starts staging
698                              the build artifacts. Recorded in create_suite_job.
    @var payload_end_time: the time when only the artifacts necessary to start
                           installing images onto DUTs are staged.
701                           Recorded in create_suite_job.
702    @var artifact_end_time: the remaining artifacts are downloaded after we kick
703                            off the reimaging job, at which point we record
704                            artifact_end_time. Recorded in dynamic_suite.py.
705    @var suite_start_time: the time the suite started.
706    @var tests_start_time: the time the first test started running.
707    @var tests_end_time: the time the last test finished running.
708    """
709
710    def __init__(self, suite_job_id):
711        self.suite_job_id = suite_job_id
712        # Timings related to staging artifacts on devserver.
713        self.download_start_time = None
714        self.payload_end_time = None
715        self.artifact_end_time = None
716
717        # The test_start_time, but taken off the view that corresponds to the
718        # suite instead of an individual test.
719        self.suite_start_time = None
720
721        # Earliest and Latest tests in the set of TestViews passed to us.
722        self.tests_start_time = None
723        self.tests_end_time = None
724
725
726    def RecordTiming(self, view):
727        """Given a test report view, extract and record pertinent time info.
728
729        get_detailed_test_views() returns a list of entries that provide
730        info about the various parts of a suite run.  This method can take
731        any one of these entries and look up timestamp info we might want
732        and record it.
733
734        If timestamps are unavailable, datetime.datetime.min/max will be used.
735
736        @param view: A TestView object.
737        """
738        start_candidate = datetime.min
739        end_candidate = datetime.max
740        if view['test_started_time']:
741            start_candidate = time_utils.time_string_to_datetime(
742                    view['test_started_time'])
743        if view['test_finished_time']:
744            end_candidate = time_utils.time_string_to_datetime(
745                    view['test_finished_time'])
746
747        if view.get_testname() == TestView.SUITE_JOB:
748            self.suite_start_time = start_candidate
749        else:
750            self._UpdateFirstTestStartTime(start_candidate)
751            self._UpdateLastTestEndTime(end_candidate)
752        if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view:
753            keyvals = view['job_keyvals']
754            self.download_start_time = time_utils.time_string_to_datetime(
755                    keyvals.get(constants.DOWNLOAD_STARTED_TIME),
756                    handle_type_error=True)
757
758            self.payload_end_time = time_utils.time_string_to_datetime(
759                    keyvals.get(constants.PAYLOAD_FINISHED_TIME),
760                    handle_type_error=True)
761
762            self.artifact_end_time = time_utils.time_string_to_datetime(
763                    keyvals.get(constants.ARTIFACT_FINISHED_TIME),
764                    handle_type_error=True)
765
766
767    def _UpdateFirstTestStartTime(self, candidate):
768        """Update self.tests_start_time, iff candidate is an earlier time.
769
770        @param candidate: a datetime.datetime object.
771        """
772        if not self.tests_start_time or candidate < self.tests_start_time:
773            self.tests_start_time = candidate
774
775
776    def _UpdateLastTestEndTime(self, candidate):
777        """Update self.tests_end_time, iff candidate is a later time.
778
779        @param candidate: a datetime.datetime object.
780        """
781        if not self.tests_end_time or candidate > self.tests_end_time:
782            self.tests_end_time = candidate
783
784
785    def __str__(self):
786        return ('\n'
787                'Suite timings:\n'
788                'Downloads started at %s\n'
789                'Payload downloads ended at %s\n'
790                'Suite started at %s\n'
791                'Artifact downloads ended (at latest) at %s\n'
792                'Testing started at %s\n'
793                'Testing ended at %s\n' % (self.download_start_time,
794                                           self.payload_end_time,
795                                           self.suite_start_time,
796                                           self.artifact_end_time,
797                                           self.tests_start_time,
798                                           self.tests_end_time))
799
800
def instance_for_pool(pool_name):
    """Look up which server should service a suite for the given pool.

    The value is read with CONFIG.get_config_value from the
    'POOL_INSTANCE_SHARDING' section, passing _DEFAULT_AUTOTEST_INSTANCE
    as the default.

    @param pool_name: The pool (without the 'pool:' prefix) to schedule the
                      suite against.
    @return: The hostname that should be used to service this suite run.
    """
    sharding_section = 'POOL_INSTANCE_SHARDING'
    instance = CONFIG.get_config_value(
            sharding_section, pool_name,
            default=_DEFAULT_AUTOTEST_INSTANCE)
    return instance
812
813
class TestView(object):
    """Represents a test view and provides a set of helper functions."""


    SUITE_JOB = 'Suite job'

    # Name prefixes of the job-level entries (as opposed to actual tests).
    _JOB_LEVEL_PREFIXES = ('SERVER_JOB', 'CLIENT_JOB')


    def __init__(self, view, afe_job, suite_name, build, user,
                 solo_test_run=False):
        """Init a TestView object representing a tko test view.

        @param view: A dictionary representing a tko test view.
        @param afe_job: An instance of frontend.afe.models.Job
                        representing the job that kicked off the test.
        @param suite_name: The name of the suite
                           that the test belongs to.
        @param build: The build for which the test is run.
        @param user: The user for which the test is run.
        @param solo_test_run: This is a solo test run not part of a suite.
        """
        self.view = view
        self.afe_job = afe_job
        self.suite_name = suite_name
        self.build = build
        self.is_suite_view = afe_job.parent_job is None and not solo_test_run
        # This is the test name that will be shown in the output.
        # It is computed lazily and cached by get_testname().
        self.testname = None
        self.user = user

        # The case that a job was aborted before it got a chance to run
        # usually indicates suite has timed out (unless aborted by user).
        # In this case, the abort reason will be None.
        # Update the reason with proper information.
        if (self.is_relevant_suite_view() and
                self.get_testname() != self.SUITE_JOB and
                self.view['status'] == 'ABORT' and
                not self.view['reason']):
            self.view['reason'] = 'Timed out, did not run.'


    def __getitem__(self, key):
        """Overload __getitem__ so that we can still use []

        @param key: A key of the tko test view.

        @returns: The value of an attribute in the view.

        """
        return self.view[key]


    def __iter__(self):
        """Overload __iter__ so that it supports 'in' operator."""
        return iter(self.view)


    def _get_job_qualified_test_name(self):
        """Get the view's test name, prefixed with the job name if needed.

        @returns: '<job_name>_<test_name>' when the view is for a
                  SERVER_JOB or CLIENT_JOB entry, otherwise the view's
                  test_name unchanged.

        """
        test_name = self.view['test_name']
        if test_name.startswith(self._JOB_LEVEL_PREFIXES):
            # Job-level entries all share the same test_name, so the job
            # name is prepended to tell them apart.
            return '%s_%s' % (self.view['job_name'], test_name)
        return test_name


    def get_testname(self):
        """Get test name that should be shown in the output.

        Formalize the test_name we got from the test view.

        Remove 'build/suite' prefix if any.

        If one runs a test in control file via the following code,
           job.runtest('my_Test', tag='tag')
        for most of the cases, view['test_name'] would look like 'my_Test.tag'.
        If this is the case, this method will just return the original
        test name, i.e. 'my_Test.tag'.

        There are four special cases.
        1) A test view is for the suite job's SERVER_JOB.
           In this case, this method will return 'Suite job'.

        2) A test view is of a child job or a solo test run not part of a
           suite, and for a SERVER_JOB or CLIENT_JOB.
           In this case, we will take the job name, remove the build/suite
           prefix from the job name, and append 'SERVER_JOB' or
           'CLIENT_JOB' to the rest. So the names returned by this
           method will look like:
             'dummy_Pass_SERVER_JOB'
             'dummy_Fail_SERVER_JOB'

        3) A test view is of a suite job and its status is ABORT.
           In this case, the view['test_name'] is the child job's name.
           For instance,
             'lumpy-release/R35-5712.0.0/dummy/dummy_Pass'
             'lumpy-release/R35-5712.0.0/dummy/dummy_Fail'
           The above names will be converted to the following:
             'dummy_Pass'
             'dummy_Fail'

        4) A test view is of a suite job and its status is TEST_NA.
           In this case, the view['test_name'] is the NAME field of the control
           file. For instance,
             'dummy_Pass'
             'dummy_Fail'
           This method will not modify these names.

        @returns: Test name after normalization.

        """
        if self.testname is not None:
            return self.testname

        if (self.is_suite_view and
                self.view['test_name'].startswith('SERVER_JOB')):
            # Rename suite job's SERVER_JOB to 'Suite job'.
            self.testname = self.SUITE_JOB
            return self.testname

        # Remove the build and suite name from testname if any.
        self.testname = tools.get_test_name(
                self.build, self.suite_name,
                self._get_job_qualified_test_name())
        return self.testname


    def is_relevant_suite_view(self):
        """Checks whether this is a suite view we should care about.

        @returns: True if it is relevant. False otherwise.
        """
        return (self.get_testname() == self.SUITE_JOB or
                (self.is_suite_view and
                    not self.view['test_name'].startswith('CLIENT_JOB') and
                    not self.view['subdir']))


    def is_test(self):
        """Return whether the view is for an actual test.

        @returns True if the view is for an actual test.
                 False if the view is for SERVER_JOB or CLIENT_JOB.

        """
        return not self.view['test_name'].startswith(self._JOB_LEVEL_PREFIXES)


    def is_retry(self):
        """Check whether the view is for a retry.

        @returns: True, if the view is for a retry; False otherwise.

        """
        return self.view['job_keyvals'].get('retry_original_job_id') is not None


    def hit_timeout(self):
        """Check whether the corresponding job has hit its own timeout.

        Note this method should not be called for those test views
        that belongs to a suite job and are determined as irrelevant
        by is_relevant_suite_view.  This is because they are associated
        to the suite job, whose job start/finished time make no sense
        to an irrelevant test view.

        @returns: True if the corresponding afe job has hit timeout.
                  False otherwise.
        """
        if (self.is_relevant_suite_view() and
                self.get_testname() != self.SUITE_JOB):
            # Any relevant suite test view except SUITE_JOB
            # did not hit its own timeout because it was not ever run.
            return False
        start = (datetime.strptime(
                self.view['job_started_time'], time_utils.TIME_FMT)
                if self.view['job_started_time'] else None)
        end = (datetime.strptime(
                self.view['job_finished_time'], time_utils.TIME_FMT)
                if self.view['job_finished_time'] else None)
        if not start or not end:
            # Without both timestamps the runtime cannot be computed.
            return False
        runtime_mins = (end - start).total_seconds() / 60.0
        return runtime_mins > self.afe_job.max_runtime_mins


    def is_aborted(self):
        """Check if the view was aborted.

        For suite job and child job test views, we check job keyval
        'aborted_by' and test status.

        For relevant suite job test views, we only check test status
        because the suite job keyval won't make sense to individual
        test views.

        @returns: True if the test was as aborted, False otherwise.

        """

        if (self.is_relevant_suite_view() and
                self.get_testname() != self.SUITE_JOB):
            return self.view['status'] == 'ABORT'
        else:
            return (bool(self.view['job_keyvals'].get('aborted_by')) and
                    self.view['status'] in ['ABORT', 'RUNNING'])


    def is_in_fail_status(self):
        """Check if the given test's status corresponds to a failure.

        @returns: True if the test's status is FAIL, ERROR or ABORT.
                  False otherwise.

        """
        # All the statuses tests can have when they fail.
        return self.view['status'] in ['FAIL', 'ERROR', 'ABORT']


    def is_provision(self):
        """Check whether this is a provision test."""
        return self.get_testname() == 'provision'


    def get_buildbot_link_reason(self):
        """Generate the buildbot link reason for the test.

        @returns: A string representing the reason.

        """
        return ('%s: %s' % (self.view['status'], self.view['reason'])
                if self.view['reason'] else self.view['status'])


    def get_job_id_owner_str(self):
        """Generate the job_id_owner string for a test.

        @returns: A string which looks like 135036-username

        """
        return '%s-%s' % (self.view['afe_job_id'], self.user)


    def get_bug_info(self, suite_job_keyvals):
        """Get the bug info from suite_job_keyvals.

        If a bug has been filed for the test, its bug info (bug id and counts)
        will be stored in the suite job's keyvals. This method attempts to
        retrieve bug info of the test from |suite_job_keyvals|. It will return
        None if no bug info is found. No need to check bug info if the view is
        SUITE_JOB.

        @param suite_job_keyvals: The job keyval dictionary of the suite job.
                All the bug info about child jobs are stored in
                suite job's keyvals.

        @returns: None if there is no bug info, or a pair with the
                  id of the bug, and the count of the number of
                  times the bug has been seen.

        """
        if self.get_testname() == self.SUITE_JOB:
            return None

        return tools.get_test_failure_bug_info(
                suite_job_keyvals, self.view['afe_job_id'],
                self._get_job_qualified_test_name())


    def should_display_buildbot_link(self):
        """Check whether a buildbot link should show for this view.

        For suite job view, show buildbot link if it fails.
        For normal test view,
            show buildbot link if it is a retry
            show buildbot link if it hits its own timeout.
            show buildbot link if it fails. This doesn't
            include the case where it was aborted but has
            not hit its own timeout (most likely it was aborted because
            suite has timed out).

        @returns: True if we should show the buildbot link.
                  False otherwise.
        """
        is_bad_status = self.view['status'] not in ('GOOD', 'TEST_NA')
        if self.get_testname() == self.SUITE_JOB:
            return is_bad_status
        if self.is_retry():
            return True
        if is_bad_status:
            return not self.is_aborted() or self.hit_timeout()
        # A passing (or TEST_NA), non-retry test needs no link. Return an
        # explicit False rather than falling off the end with None.
        return False


    def get_control_file_attributes(self):
        """Get the attributes from the control file of the test.

        @returns: A list of test attribute or None.
        """
        control_file = self.afe_job.control_file
        attributes = None
        if control_file:
            cd = control_data.parse_control_string(control_file)
            attributes = list(cd.attributes)
        return attributes


    def override_afe_job_id(self, afe_job_id):
        """Overrides the AFE job id for the test.

        @param afe_job_id: The new AFE job id to use.
        """
        self.view['afe_job_id'] = afe_job_id
1130
1131
def log_buildbot_links(log_func, links):
    """Emit every buildbot annotation produced by the given links.

    For each link, all generated buildbot links are logged first, then
    the retry link and the history link, each only if non-empty.

    @param log_func: Logging function to use.
    @param links: Iterable of LogLink instances.
    """
    def _log_if_present(line):
        """Log |line| unless it is empty."""
        if line:
            log_func(line)

    for log_link in links:
        for buildbot_line in log_link.GenerateBuildbotLinks():
            log_func(buildbot_line)
        _log_if_present(log_link.GenerateRetryLink())
        _log_if_present(log_link.GenerateHistoryLink())
1147
1148
class _ReturnCodeComputer(object):
    """Callable that folds a list of test views into one _ReturnResult."""

    def __call__(self, test_views):
        """Compute the exit code based on test results.

        Starts from the 'ok' result and combines in, with |=, the result
        of every view: the suite job's view goes through
        _get_suite_result, every other view through _get_test_result.

        @param test_views: Iterable of TestView objects.
        @returns: The combined result.
        """
        combined = _RETURN_RESULTS['ok']

        for view in test_views:
            if view.get_testname() == TestView.SUITE_JOB:
                classify = self._get_suite_result
            else:
                classify = self._get_test_result
            combined |= classify(view)
        return combined

    def _get_suite_result(self, test_view):
        """Return the _ReturnResult for the given suite job."""
        # The checks are ordered by priority; the first match wins.
        if test_view.is_aborted() and test_view.hit_timeout():
            key = 'suite_timeout'
        elif test_view.is_in_fail_status():
            key = 'suite_failed'
        elif test_view['status'] == 'WARN':
            key = 'suite_warning'
        else:
            key = 'ok'
        return _RETURN_RESULTS[key]

    def _get_test_result(self, test_view):
        """Return the _ReturnResult for the given test job."""
        # The checks are ordered by priority; the first match wins.
        if test_view.is_aborted():
            if test_view.is_relevant_suite_view():
                # The test was aborted before it started, which
                # guarantees that the suite has timed out.
                return _RETURN_RESULTS['test_aborted_prestart']
            if not test_view.hit_timeout():
                # Aborted, but not because of its own timeout. Most
                # likely the suite timed out, though a user abort is
                # also possible. Suite timeouts are detected from the
                # suite job view, so this view is simply ignored here.
                return _RETURN_RESULTS['test_aborted_mystery']

        if test_view.is_in_fail_status():
            # The test job failed.
            key = ('provision_failed' if test_view.is_provision()
                   else 'test_failure')
        elif test_view['status'] == 'WARN':
            key = 'test_warning'
        elif test_view.is_retry():
            # The test is a passing retry.
            key = 'test_retry'
        else:
            key = 'ok'
        return _RETURN_RESULTS[key]
1202
1203
class _ProvisionReturnCodeComputer(_ReturnCodeComputer):
    """Return code computer for provision suites.

    The overall result is 'ok' as soon as enough individual provision
    jobs succeeded, whatever the other jobs did.
    """

    def __init__(self, num_required):
        """Initialize instance.

        @param num_required: The number of passing provision jobs needed.
        """
        super(_ProvisionReturnCodeComputer, self).__init__()
        self._num_successful = 0
        self._num_required = num_required

    def __call__(self, test_views):
        """Compute the result, upgrading to 'ok' on enough successes.

        Note the success counter is only initialized in __init__, so it
        keeps accumulating if the same instance is called more than once.

        @param test_views: Iterable of TestView objects.
        @returns: 'ok' if at least num_required views were successful,
                  otherwise the result computed by the base class.
        """
        base_result = super(
                _ProvisionReturnCodeComputer, self).__call__(test_views)
        if self._num_successful < self._num_required:
            return base_result
        logging.info('Return result upgraded from %r'
                     ' due to enough ok provisions',
                     base_result)
        return _RETURN_RESULTS['ok']

    def _get_test_result(self, test_view):
        """Classify one test view, counting it if it was successful.

        A view counts as successful when its result is 'ok' or
        'test_retry'.
        """
        test_result = super(
                _ProvisionReturnCodeComputer, self)._get_test_result(test_view)
        successful_results = {_RETURN_RESULTS['ok'],
                              _RETURN_RESULTS['test_retry']}
        if test_result in successful_results:
            self._num_successful += 1
        return test_result
1232
1233
1234class ResultCollector(object):
1235    """Collect test results of a suite or a single test run.
1236
1237    Once a suite job has finished, use this class to collect test results.
1238    `run` is the core method that is to be called first. Then the caller
1239    could retrieve information like return code, return message, is_aborted,
1240    and timings by accessing the collector's public attributes. And output
1241    the test results and links by calling the 'output_*' methods.
1242
    Here is an overview of what `run` method does.
1244
1245    1) Collect the suite job's results from tko_test_view_2.
1246    For the suite job, we only pull test views without a 'subdir'.
1247    A NULL subdir indicates that the test was _not_ executed. This could be
1248    that no child job was scheduled for this test or the child job got
1249    aborted before starts running.
1250    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
1251
1252    2) Collect the child jobs' results from tko_test_view_2.
1253    For child jobs, we pull all the test views associated with them.
1254    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
1255
1256    3) Generate web and buildbot links.
1257    4) Compute timings of the suite run.
1258    5) Compute the return code based on test results.
1259
1260    @var _instance_server: The hostname of the server that is used
1261                           to service the suite.
1262    @var _afe: The afe rpc client.
1263    @var _tko: The tko rpc client.
1264    @var _build: The build for which the suite is run,
1265                 e.g. 'lumpy-release/R35-5712.0.0'
1266    @var _board: The target board for which the suite is run,
1267                 e.g., 'lumpy', 'link'.
1268    @var _suite_name: The suite name, e.g. 'bvt', 'dummy'.
1269    @var _suite_job_id: The job id of the suite for which we are going to
1270                        collect results.
1271    @var _original_suite_name: The suite name we record timing would be
1272                               different from _suite_name when running
1273                               suite_attr_wrapper.
1274    @var _return_code_function: Called to return what the overall result of
1275                                the suite is.
1276    @var _suite_views: A list of TestView objects, representing relevant
1277                       test views of the suite job.
1278    @var _child_views: A list of TestView objects, representing test views
1279                       of the child jobs.
1280    @var _test_views: A list of TestView objects, representing all test views
1281                      from _suite_views and _child_views.
1282    @var _web_links: A list of web links pointing to the results of jobs.
1283    @var buildbot_links: A list of buildbot links for non-passing tests.
1284    @var _solo_test_run: True if this is a single test run.
1285    @var return_result: The _ReturnResult of the suite run.
1286    @var is_aborted: Whether the suite was aborted or not.
1287                     True, False or None (aborting status is unknown yet)
1288    @var timings: A Timing object that records the suite's timings.
1289
1290    """
1291
1292
1293    def __init__(self, instance_server, afe, tko, build, board,
1294                 suite_name, suite_job_id,
1295                 return_code_function,
1296                 original_suite_name=None,
1297                 user=None, solo_test_run=False):
1298        self._instance_server = instance_server
1299        self._afe = afe
1300        self._tko = tko
1301        self._build = build
1302        self._board = board
1303        self._suite_name = suite_name
1304        self._suite_job_id = suite_job_id
1305        self._original_suite_name = original_suite_name or suite_name
1306        self._return_code_function = return_code_function
1307        self._suite_views = []
1308        self._child_views = []
1309        self._test_views = []
1310        self._retry_counts = {}
1311        self._missing_results = {}
1312        self._web_links = []
1313        self.buildbot_links = []
1314        self._num_child_jobs = 0
1315        self.return_result = None
1316        self.is_aborted = None
1317        self.timings = None
1318        self._user = user or getpass.getuser()
1319        self._solo_test_run = solo_test_run
1320
1321
1322    def _fetch_relevant_test_views_of_suite(self):
1323        """Fetch relevant test views of the suite job.
1324
1325        For the suite job, there will be a test view for SERVER_JOB, and views
1326        for results of its child jobs. For example, assume we've created
1327        a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail,
1328        dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while
1329        dummy_Path.bluetooth got TEST_NA as no duts have bluetooth.
1330        So the suite job's test views would look like
1331        _____________________________________________________________________
1332        test_idx| job_idx|test_name           |subdir      |afe_job_id|status
1333        10      | 1000   |SERVER_JOB          |----        |40        |GOOD
1334        11      | 1000   |dummy_Pass          |NULL        |40        |ABORT
1335        12      | 1000   |dummy_Fail.Fail     |41-onwer/...|40        |FAIL
1336        13      | 1000   |dummy_Fail.Error    |42-owner/...|40        |ERROR
1337        14      | 1000   |dummy_Pass.bluetooth|NULL        |40        |TEST_NA
1338
1339        For a suite job, we only care about
1340        a) The test view for the suite job's SERVER_JOB
1341        b) The test views for real tests without a subdir. A NULL subdir
1342           indicates that a test didn't get executed.
1343        So, for the above example, we only keep test views whose test_idxs
1344        are 10, 11, 14.
1345
1346        @returns: A list of TestView objects, representing relevant
1347                  test views of the suite job.
1348
1349        """
1350        suite_job = self._afe.get_jobs(id=self._suite_job_id)[0]
1351        views = self._tko.run(call='get_detailed_test_views',
1352                              afe_job_id=self._suite_job_id)
1353        relevant_views = []
1354        for v in views:
1355            v = TestView(v, suite_job, self._suite_name, self._build, self._user,
1356                         solo_test_run=self._solo_test_run)
1357            if v.is_relevant_suite_view():
1358                # If the test doesn't have results in TKO and is being
1359                # displayed in the suite view instead of the child view,
1360                # then afe_job_id is incorrect and from the suite.
1361                # Override it based on the AFE job id which was missing
1362                # results.
1363                # TODO: This is likely inaccurate if a test has multiple
1364                # tries which all fail TKO parse stage.
1365                if v['test_name'] in self._missing_results:
1366                    v.override_afe_job_id(
1367                            self._missing_results[v['test_name']][0])
1368                relevant_views.append(v)
1369        return relevant_views
1370
1371
1372    def _compute_retry_count(self, view):
1373        """Return how many times the test has been retried.
1374
1375        @param view: A TestView instance.
1376        @returns: An int value indicating the retry count.
1377
1378        """
1379        old_job = view['job_keyvals'].get('retry_original_job_id')
1380        count = 0
1381        while old_job:
1382            count += 1
1383            views = self._tko.run(
1384                call='get_detailed_test_views', afe_job_id=old_job)
1385            old_job = (views[0]['job_keyvals'].get('retry_original_job_id')
1386                       if views else None)
1387        return count
1388
1389
1390    def _fetch_test_views_of_child_jobs(self, jobs=None):
1391        """Fetch test views of child jobs.
1392
1393        @returns: A tuple (child_views, retry_counts, missing_results)
1394                  child_views is list of TestView objects, representing
1395                  all valid views.
1396                  retry_counts is a dictionary that maps test_idx to retry
1397                  counts. It only stores retry counts that are greater than 0.
1398                  missing_results is a dictionary that maps test names to
1399                  lists of job ids.
1400
1401        """
1402        child_views = []
1403        retry_counts = {}
1404        missing_results = {}
1405        child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id)
1406        if child_jobs:
1407            self._num_child_jobs = len(child_jobs)
1408        for job in child_jobs:
1409            views = [TestView(v, job, self._suite_name, self._build, self._user)
1410                     for v in self._tko.run(
1411                         call='get_detailed_test_views', afe_job_id=job.id,
1412                         invalid=0)]
1413            if len(views) == 0:
1414                missing_results.setdefault(job.name, []).append(job.id)
1415            contains_test_failure = any(
1416                    v.is_test() and v['status'] != 'GOOD' for v in views)
1417            for v in views:
1418                if (v.is_test() or
1419                        v['status'] != 'GOOD' and not contains_test_failure):
1420                    # For normal test view, just keep it.
1421                    # For SERVER_JOB or CLIENT_JOB, only keep it
1422                    # if it fails and no other test failure.
1423                    child_views.append(v)
1424                    retry_count = self._compute_retry_count(v)
1425                    if retry_count > 0:
1426                        retry_counts[v['test_idx']] = retry_count
1427        return child_views, retry_counts, missing_results
1428
1429
1430    def _generate_web_and_buildbot_links(self):
1431        """Generate web links and buildbot links."""
1432        # TODO(fdeng): If a job was aborted before it reaches Running
1433        # state, we read the test view from the suite job
1434        # and thus this method generates a link pointing to the
1435        # suite job's page for the aborted job. Need a fix.
1436        self._web_links = []
1437        self.buildbot_links = []
1438
1439        # Bug info are stored in the suite job's keyvals.
1440        if self._solo_test_run:
1441            suite_job_keyvals = {}
1442        elif not self._suite_views:
1443            suite_job_keyvals = {}
1444        else:
1445            suite_job_keyvals = self._suite_views[0]['job_keyvals']
1446
1447        for v in self._test_views:
1448            retry_count = self._retry_counts.get(v['test_idx'], 0)
1449            bug_info = v.get_bug_info(suite_job_keyvals)
1450            job_id_owner = v.get_job_id_owner_str()
1451            link = LogLink(
1452                    anchor=v.get_testname(),
1453                    server=self._instance_server,
1454                    job_string=job_id_owner,
1455                    bug_info=bug_info, retry_count=retry_count,
1456                    testname=v.get_testname(),
1457                    sponge_url=suite_job_keyvals.get('sponge_url'))
1458            self._web_links.append(link)
1459
1460            if v.should_display_buildbot_link():
1461                link.reason = v.get_buildbot_link_reason()
1462                self.buildbot_links.append(link)
1463
1464
1465    def _record_timings(self):
1466        """Record suite timings."""
1467        self.timings = Timings(self._suite_job_id)
1468        for v in self._test_views:
1469            self.timings.RecordTiming(v)
1470
1471
1472    def _compute_return_code(self):
1473        """Compute the exit code based on test results."""
1474        self.return_result = self._return_code_function(self._test_views)
1475
1476
1477    def _make_test_results(self):
1478        """Make TestResults for collected tests.
1479
1480        @returns: List of TestResult instances.
1481        """
1482        test_results = []
1483        for test_view in self._test_views:
1484            test_result = TestResult(
1485                test_view=test_view,
1486                retry_count=self._retry_counts.get(test_view['test_idx'], 0))
1487            test_results.append(test_result)
1488        return test_results
1489
1490
1491    def output_results(self):
1492        """Output test results, timings and web links."""
1493        # Output test results
1494        test_results = self._make_test_results()
1495        if len(test_results) == 0:
1496            max_name_length = 0
1497        else:
1498            max_name_length = max(len(t.name) for t in test_results)
1499        for test_result in test_results:
1500            test_result.log_using(logging.info, max_name_length + 3)
1501        # Output suite timings
1502        logging.info(self.timings)
1503        # Output links to test logs
1504        logging.info('\nLinks to test logs:')
1505        for link in self._web_links:
1506            logging.info(link.text_link)
1507        logging.info('\n')
1508
1509
1510    def get_results_dict(self):
1511        """Write test results, timings and web links into a dict.
1512
1513        @returns: A dict of results in the format like:
1514                  {
1515                  'tests': {
1516                        'test_1': {'status': 'PASSED', 'attributes': [1,2], ...}
1517                        'test_2': {'status': 'FAILED', 'attributes': [1],...}
1518                  }
1519                  'suite_timings': {
1520                        'download_start': '1998-07-17 00:00:00',
1521                        'payload_download_end': '1998-07-17 00:00:05',
1522                        ...
1523                  }
1524                  }
1525        """
1526        output_dict = {}
1527        tests_dict = output_dict.setdefault('tests', {})
1528        for v in self._test_views:
1529            test_name = v.get_testname()
1530            test_info = tests_dict.setdefault(test_name, {})
1531            test_info.update({
1532                'status': v['status'],
1533                'attributes': v.get_control_file_attributes() or list(),
1534                'reason': v['reason'],
1535                'retry_count': self._retry_counts.get(v['test_idx'], 0),
1536                })
1537            # For aborted test, the control file will not be parsed and thus
1538            # fail to get the attributes info. Therefore, the subsystems the
1539            # abort test testing will be missing. For this case, we will assume
1540            # the aborted test will test all subsystems, set subsystem:default.
1541            if (test_info['status'] == 'ABORT' and
1542                not any('subsystem:' in a for a in test_info['attributes'])):
1543                test_info['attributes'].append('subsystem:default')
1544
1545        # Write the links to test logs into the |tests_dict| of |output_dict|.
1546        # For test whose status is not 'GOOD', the link is also buildbot_link.
1547        for link in self._web_links:
1548            test_name = link.anchor.strip()
1549            test_info = tests_dict.get(test_name)
1550            if test_info:
1551                test_info['link_to_logs'] = link.url
1552                test_info['sponge_url'] = link.sponge_url
1553                # Write the retry dashboard link into the dict.
1554                if link in self.buildbot_links and link.testname:
1555                    test_info['retry_dashboard_link'] \
1556                        = reporting_utils.link_retry_url(link.testname)
1557                    # Always write the wmatrix link for compatibility.
1558                    test_info['wmatrix_link'] \
1559                        = reporting_utils.link_wmatrix_retry_url(link.testname)
1560                # Write the bug url into the dict.
1561                if link.bug_id:
1562                    test_info['bug_url'] = link.bug_url
1563
1564        # Write the suite timings into |output_dict|
1565        timings = self.timings
1566        if timings is not None:
1567            time_dict = output_dict.setdefault('suite_timings', {})
1568            time_dict.update({
1569                'download_start' : str(timings.download_start_time),
1570                'payload_download_end' : str(timings.payload_end_time),
1571                'suite_start' : str(timings.suite_start_time),
1572                'artifact_download_end' : str(timings.artifact_end_time),
1573                'tests_start' : str(timings.tests_start_time),
1574                'tests_end' : str(timings.tests_end_time),
1575                })
1576
1577        output_dict['suite_job_id'] = self._suite_job_id
1578
1579        return output_dict
1580
1581
1582    def run(self):
1583        """Collect test results.
1584
1585        This method goes through the following steps:
1586            Fetch relevent test views of the suite job.
1587            Fetch test views of child jobs
1588            Check whether the suite was aborted.
1589            Generate links.
1590            Calculate suite timings.
1591            Compute return code based on the test result.
1592
1593        """
1594        if self._solo_test_run:
1595            self._test_views, self._retry_counts, self._missing_results = (
1596                  self._fetch_test_views_of_child_jobs(
1597                          jobs=self._afe.get_jobs(id=self._suite_job_id)))
1598        else:
1599            self._child_views, self._retry_counts, self._missing_results = (
1600                    self._fetch_test_views_of_child_jobs())
1601            self._suite_views = self._fetch_relevant_test_views_of_suite()
1602            self._test_views = self._suite_views + self._child_views
1603        # For hostless job in Starting status, there is no test view associated.
1604        # This can happen when a suite job in Starting status is aborted. When
1605        # the scheduler hits some limit, e.g., max_hostless_jobs_per_drone,
1606        # max_jobs_started_per_cycle, a suite job can stays in Starting status.
1607        if not self._test_views:
1608            self.return_result = _RETURN_RESULTS['test_views_missing']
1609            return
1610        self.is_aborted = any([view['job_keyvals'].get('aborted_by')
1611                               for view in self._suite_views])
1612        self._generate_web_and_buildbot_links()
1613        self._record_timings()
1614        self._compute_return_code()
1615
1616
1617    def gather_timing_stats(self):
1618        """Collect timing related statistics."""
1619        # Record suite runtime in metadata db.
1620        # Some failure modes can leave times unassigned, report sentinel value
1621        # in that case.
1622        runtime_in_secs = -1
1623        if (self.timings.tests_end_time is not None and
1624            self.timings.suite_start_time is not None):
1625            runtime_in_secs = (self.timings.tests_end_time -
1626                    self.timings.suite_start_time).total_seconds()
1627
1628        job_overhead.record_suite_runtime(self._suite_job_id, self._suite_name,
1629                self._board, self._build, self._num_child_jobs, runtime_in_secs)
1630
1631
1632
1633def _make_builds_from_options(options):
1634    """Create a dict of builds for creating a suite job.
1635
1636    The returned dict maps version label prefixes to build names.  Together,
1637    each key-value pair describes a complete label.
1638
1639    @param options: SimpleNamespace from argument parsing.
1640
1641    @return: dict mapping version label prefixes to build names
1642    """
1643    builds = {}
1644    build_prefix = None
1645    if options.build:
1646        build_prefix = provision.get_version_label_prefix(options.build)
1647        builds[build_prefix] = options.build
1648    if options.cheets_build:
1649        builds[provision.CROS_ANDROID_VERSION_PREFIX] = options.cheets_build
1650        if build_prefix == provision.CROS_VERSION_PREFIX:
1651            builds[build_prefix] += provision.CHEETS_SUFFIX
1652    if options.firmware_rw_build:
1653        builds[provision.FW_RW_VERSION_PREFIX] = options.firmware_rw_build
1654    if options.firmware_ro_build:
1655        builds[provision.FW_RO_VERSION_PREFIX] = options.firmware_ro_build
1656    return builds
1657
1658
1659def _make_child_deps_from_options(options):
1660    """Creates a list of extra dependencies for child jobs.
1661
1662    @param options: Parsed arguments to run_suite.
1663
1664    @returns: A list of label strings if any dependencies should be added. None
1665            otherwise.
1666    """
1667    if not options.model:
1668        return ()
1669    return ['model:%s' % options.model]
1670
1671
@retry.retry(error.StageControlFileFailure, timeout_min=10)
def create_suite(afe, options):
    """Create a suite with retries.

    The 'create_suite_job' RPC is issued through afe.run. The retry
    decorator is configured with error.StageControlFileFailure and a timeout
    of 10 minutes.

    Side effect: when options.oldrpc is set, options.suite_args is replaced
    by its string form for the old RPC. The replacement is only done while
    suite_args is still a dict.

    @param afe: The afe object to insert the new suite job into.
    @param options: The options to use in creating the suite.

    @return: The afe_job_id of the new suite job.
    """
    logging.info('%s Submitted create_suite_job rpc',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))

    # TODO(crbug.com/763207): This is to support calling old moblab RPC
    # with ToT code.  This does not need to be supported after M62.
    #
    # The isinstance check keeps this conversion from running twice on the
    # same options object. NOTE(review): presumably a retry re-invokes this
    # function with that object; a second pass would then hit the converted
    # string (TypeError on ['tests'], or a doubly repr()-ed value).
    if options.oldrpc and isinstance(options.suite_args, dict):
        suite_args = options.suite_args
        if 'tests' in suite_args:
            # This is for test_that_wrapper
            suite_args = ' '.join([':lab:'] + suite_args['tests'])
        else:
            # This is for suite_attr_wrapper
            suite_args = repr(suite_args)
        options.suite_args = suite_args

    return afe.run(
        'create_suite_job',
        name=options.name,
        board=options.board,
        builds=_make_builds_from_options(options),
        test_source_build=options.test_source_build,
        check_hosts=not options.no_wait,
        pool=options.pool,
        file_bugs=options.file_bugs,
        priority=options.priority,
        suite_args=options.suite_args,
        wait_for_results=not options.no_wait,
        # The start delay is added on top of both time limits.
        timeout_mins=options.timeout_mins + options.delay_minutes,
        max_runtime_mins=options.max_runtime_mins + options.delay_minutes,
        job_retry=options.retry,
        max_retries=options.max_retries,
        suite_min_duts=options.suite_min_duts,
        offload_failures_only=options.offload_failures_only,
        run_prod_code=options.run_prod_code,
        delay_minutes=options.delay_minutes,
        job_keyvals=options.job_keyvals,
        test_args=options.test_args,
        child_dependencies=_make_child_deps_from_options(options),
    )
1720
1721
class SuiteResult(namedtuple('SuiteResult', ['return_code', 'output_dict'])):
    """Result of running a suite to return.

    The output dict stored on the instance is a copy of the one passed in
    (or a new dict when none is given) and always carries the return code
    under the 'return_code' key.
    """

    def __new__(cls, return_code, output_dict=None):
        # Work on a copy so that the caller's dict is left untouched.
        payload = {} if output_dict is None else output_dict.copy()
        payload['return_code'] = return_code
        return super(SuiteResult, cls).__new__(cls, return_code, payload)
1732
1733
def _run_suite(options):
    """
    run_suite script without exception handling.

    Either creates a new suite job or, when options.mock_job_id is set,
    looks up an existing job; then hands the job over to
    _handle_job_nowait or _handle_job_wait.

    @param options: The parsed options.

    @returns: A SuiteResult (return_code, output_dict) for the run.

    @raises utils.TestLabException: if options.mock_job_id is set but no job
            with that id can be retrieved.

    """
    # If indicate to use the new style suite control file, convert the args
    if options.use_suite_attr:
        options = change_options_for_suite_attr(options)

    log_name = _get_log_name(options)
    utils.setup_logging(logfile=log_name)

    # The lab status check is skipped when explicitly bypassed or when a
    # specific web server was requested.
    if not options.bypass_labstatus and not options.web:
        utils.check_lab_status(options.build)

    afe = _create_afe(options)
    instance_server = afe.server

    rpc_helper = diagnosis_utils.RPCHelper(afe)
    # is_real_time is only cleared when a mocked job is already finished.
    is_real_time = True
    if options.mock_job_id:
        job_id = int(options.mock_job_id)
        # Look for a finished job first; fall back to any job with that id.
        existing_job = afe.get_jobs(id=job_id, finished=True)
        if existing_job:
            is_real_time = False
        else:
            existing_job = afe.get_jobs(id=job_id)
        if existing_job:
            job_created_on = time_utils.date_string_to_epoch_time(
                    existing_job[0].created_on)
        else:
            raise utils.TestLabException('Failed to retrieve job: %d' % job_id)
    else:
        try:
            rpc_helper.check_dut_availability(options.board, options.pool,
                                              options.minimum_duts,
                                              options.skip_duts_check)
            job_id = create_suite(afe, options)
            job_created_on = time.time()
        except (error.CrosDynamicSuiteException,
                error.RPCException, proxy.JSONRPCException) as e:
            # Suite creation / RPC problems are reported as INFRA_FAILURE.
            logging.exception('Error Message: %s', e)
            return SuiteResult(RETURN_CODES.INFRA_FAILURE,
                               {'return_message': str(e)})
        except AttributeError as e:
            # An AttributeError raised here is reported as INVALID_OPTIONS.
            logging.exception('Error Message: %s', e)
            return SuiteResult(RETURN_CODES.INVALID_OPTIONS)

    job_timer = diagnosis_utils.JobTimer(
            job_created_on, float(options.timeout_mins))
    job_url = reporting_utils.link_job(job_id,
                                       instance_server=instance_server)
    logging.info('%s Created suite job: %s',
                 job_timer.format_time(job_timer.job_created_time),
                 job_url)
    logging.info(annotations.StepLink(
        text='Link to suite',
        url=job_url))

    # Stop right after job creation when asked to.
    if options.create_and_return:
        msg = '--create_and_return was specified, terminating now.'
        logging.info(msg)
        return SuiteResult(RETURN_CODES.OK, {'return_message': msg})

    if options.no_wait:
        return _handle_job_nowait(job_id, options, instance_server)
    else:
        return _handle_job_wait(afe, job_id, options, job_timer, is_real_time)
1807
1808
1809def _get_log_name(options):
1810    """Return local log file's name.
1811
1812    @param options:         Parsed options.
1813
1814    @return log_name, a string file name.
1815    """
1816    if options.require_logfile:
1817        # options.build is verified to exist in verify_options.
1818        # convert build name from containing / to containing only _.
1819        log_name = 'run_suite-%s.log' % options.build.replace('/', '_')
1820        log_dir = os.path.join(common.autotest_dir, 'logs')
1821        if os.path.exists(log_dir):
1822            log_name = os.path.join(log_dir, log_name)
1823
1824        return log_name
1825    else:
1826        return None
1827
1828
def _create_afe(options):
    """Return an afe instance based on options.

    The server is options.web when it is set; otherwise it is looked up
    from options.pool through instance_for_pool.

    @param options          Parsed options.

    @return afe, an AFE instance.
    """
    server = options.web or instance_for_pool(options.pool)
    retrying_afe = frontend_wrappers.RetryingAFE(
            server=server,
            timeout_min=options.afe_timeout_mins,
            delay_sec=options.delay_sec)
    logging.info('Autotest instance created: %s', server)
    return retrying_afe
1843
1844
def _handle_job_wait(afe, job_id, options, job_timer, is_real_time):
    """Handle suite job synchronously.

    Polls the AFE until the job is reported as finished, collects the test
    results with a ResultCollector, logs them (unless options.json_dump is
    set) and returns the final result.

    @param afe              AFE instance.
    @param job_id           Suite job id.
    @param options          Parsed options.
    @param job_timer        JobTimer for suite job.
    @param is_real_time     Whether or not to handle job timeout. When
                            false, timing stats, the suite timeout check
                            and the pool diagnosis are all skipped.

    @return SuiteResult of suite job.
    """
    rpc_helper = diagnosis_utils.RPCHelper(afe)
    instance_server = afe.server
    # Poll every 10 seconds until the AFE lists the job as finished.
    while not afe.get_jobs(id=job_id, finished=True):
        _poke_buildbot_with_output(afe, job_id, job_timer)
        if job_timer.debug_output_timer.poll():
            logging.info('The suite job has another %s till timeout.',
                         job_timer.timeout_hours - job_timer.elapsed_time())
        time.sleep(10)
    logging.info('%s Suite job is finished.',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    # For most cases, ResultCollector should be able to determine whether
    # a suite has timed out by checking information in the test view.
    # However, occasionally tko parser may fail on parsing the
    # job_finished time from the job's keyval file. So we add another
    # layer of timeout check in run_suite. We do the check right after
    # the suite finishes to make it as accurate as possible.
    # There is a minor race condition here where we might have aborted
    # for some reason other than a timeout, and the job_timer thinks
    # it's a timeout because of the jitter in waiting for results.
    # The consequence would be that run_suite exits with code
    # SUITE_TIMEOUT while it should have returned INFRA_FAILURE
    # instead, which should happen very rarely.
    # Note the timeout is meaningless when using the -m option.
    is_suite_timeout = job_timer.is_suite_timeout()

    # Extract the original suite name to record timing.
    original_suite_name = get_original_suite_name(options.name,
                                                  options.suite_args)
    # Start collecting test results.
    logging.info('%s Start collecting test results and dump them to json.',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    TKO = frontend_wrappers.RetryingTKO(server=instance_server,
                                        timeout_min=options.afe_timeout_mins,
                                        delay_sec=options.delay_sec)
    # TODO(crbug.com/672348): It needs to be possible for provision
    # suite to pass if only a few tests fail.  Otherwise, a single
    # failing test will be reported as failure even if the suite reports
    # success.
    if options.name == _PROVISION_SUITE:
        # TODO(crbug.com/672348): Creating the suite job requires that
        # suite_args contains num_required.
        return_code_function = _ProvisionReturnCodeComputer(
            num_required=options.suite_args['num_required'])
    else:
        return_code_function = _ReturnCodeComputer()
    collector = ResultCollector(instance_server=instance_server,
                                afe=afe, tko=TKO, build=options.build,
                                board=options.board,
                                suite_name=options.name,
                                suite_job_id=job_id,
                                return_code_function=return_code_function,
                                original_suite_name=original_suite_name)
    collector.run()
    # Dump test outputs into json.
    output_dict = collector.get_results_dict()
    output_dict['autotest_instance'] = instance_server
    # In json_dump mode the per-test results are not logged.
    if not options.json_dump:
        collector.output_results()
    result = collector.return_result
    if is_real_time:
        # Do not record stats if the suite was aborted (either by a user
        # or through the golo rpc).
        # Also do not record stats if is_aborted is None, indicating
        # aborting status is unknown yet.
        if collector.is_aborted == False:
            logging.info('%s Gathering timing stats for the suite job.',
                         diagnosis_utils.JobTimer.format_time(datetime.now()))
            collector.gather_timing_stats()

        if collector.is_aborted == True and is_suite_timeout:
            # There are two possible cases when a suite times out.
            # 1. the suite job was aborted due to timing out
            # 2. the suite job succeeded, but some child jobs
            #    were already aborted before the suite job exited.
            # The case 2 was handled by ResultCollector,
            # here we handle case 1.
            result |= _RETURN_RESULTS['suite_timeout']
        logging.info('\n %s Attempting to display pool info: %s',
                     diagnosis_utils.JobTimer.format_time(datetime.now()),
                     options.pool)
        # Pool diagnosis is best effort: an RPC failure only logs a warning.
        try:
            # Add some jitter to make up for any latency in
            # aborting the suite or checking for results.
            cutoff = job_timer.timeout_hours + timedelta(hours=0.3)
            rpc_helper.diagnose_pool(
                    options.board, options.pool, cutoff)
        except proxy.JSONRPCException:
            logging.warning('Unable to display pool info.')

    # And output return message.
    if result.message:
        logging.info('Reason: %s', result.message)

    logging.info('\n %s Output below this line is for buildbot consumption:',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    log_buildbot_links(logging.info, collector.buildbot_links)
    return result.suite_result(output_dict)
1953
1954
def _handle_job_nowait(job_id, options, instance_server):
    """Handle suite job asynchronously.

    Logs the buildbot links of the suite job and returns right away with
    an OK result; the job itself is not waited for.

    @param job_id           Suite job id.
    @param options          Parsed options.
    @param instance_server  Autotest instance hostname.

    @return SuiteResult of suite job.
    """
    logging.info('Created suite job: %r', job_id)

    job_owner = '%s-%s' % (job_id, getpass.getuser())
    suite_link = LogLink(options.name, instance_server, job_owner)
    for buildbot_line in suite_link.GenerateBuildbotLinks():
        logging.info(buildbot_line)

    exit_message = '--no_wait specified; Exiting.'
    logging.info(exit_message)
    return SuiteResult(RETURN_CODES.OK, {'return_message': exit_message})
1972
1973
def _should_run(options):
    """Check whether the suite should be run based on lab/job status checking.

    @param options          Parsed options.

    @return False when the lab status check raises TestLabException, or
            when the AFE already lists a matching suite job created within
            the last _SEARCH_JOB_MAX_DAYS days; True otherwise.
    """
    try:
        site_utils.check_lab_status(options.test_source_build)
    except site_utils.TestLabException as ex:
        logging.exception('Lab is closed or build is blocked. Skipping '
                          'suite %s, board %s, build %s:  %s',
                          options.name, options.board,
                          options.test_source_build, str(ex))
        return False

    search_window = timedelta(days=_SEARCH_JOB_MAX_DAYS)
    earliest_created = str(datetime.now() - search_window)
    afe = _create_afe(options)
    # A duplicate is a job named "<test_source_build>...control.<suite>".
    duplicate_jobs = afe.get_jobs(
            name__istartswith=options.test_source_build,
            name__iendswith='control.' + options.name,
            created_on__gte=earliest_created,
            min_rpc_timeout=_MIN_RPC_TIMEOUT)
    if not duplicate_jobs:
        return True

    logging.info('Found duplicate suite %s scheduled in past.',
                 duplicate_jobs)
    return False
2002
2003
def _poke_buildbot_with_output(afe, job_id, job_timer):
    """Poke buildbot so it doesn't timeout from silence.

    The job is diagnosed only when job_timer.first_past_halftime() is true.

    @param afe              AFE instance.
    @param job_id           Suite job id.
    @param job_timer        JobTimer for suite job.
    """
    diagnoser = diagnosis_utils.RPCHelper(afe)
    if not job_timer.first_past_halftime():
        return
    # Note that this call logs output, preventing buildbot's
    # 9000 second silent timeout from kicking in. Let there be no
    # doubt, this is a hack. The timeout is from upstream buildbot and
    # this is the easiest work around.
    diagnoser.diagnose_job(job_id, afe.server)
2018
2019
2020
def _run_task(options):
    """Perform this script's function minus setup.

    Boilerplate like argument parsing, logging, output formatting happen
    elsewhere.

    Runs _run_suite() and converts the two exceptions handled here into
    results: BoardNotAvailableError becomes BOARD_NOT_AVAILABLE and
    TestLabException becomes INFRA_FAILURE. Any other exception propagates.

    @param options: The parsed options.

    @returns: A SuiteResult instance.

    TODO(ayatane): The try/except should be moved into _run_suite().
    Good luck trying to figure out which function calls are supposed to
    raise which of the exceptions.
    """
    try:
        return _run_suite(options)
    except diagnosis_utils.BoardNotAvailableError as e:
        # Format the exception itself instead of reading e.message: that
        # attribute is deprecated since Python 2.6 and gone in Python 3.
        # This also matches the TestLabException handler below.
        result = SuiteResult(
            RETURN_CODES.BOARD_NOT_AVAILABLE,
            {'return_message': 'Skipping testing: %s' % e})
        logging.info(result.output_dict['return_message'])
        return result
    except utils.TestLabException as e:
        result = SuiteResult(
            RETURN_CODES.INFRA_FAILURE,
            {'return_message': 'TestLabException: %s' % e})
        logging.exception(result.output_dict['return_message'])
        return result
2047
2048
class _ExceptionHandler(object):
    """Global exception handler replacement.

    Instances are callables that take the (type, value, traceback) triple
    of an uncaught exception, i.e. the signature of sys.excepthook.
    """

    def __init__(self, dump_json):
        """Initialize instance.

        @param dump_json: Whether to print a JSON dump of the result dict to
                          stdout.
        """
        self._should_dump_json = dump_json

    def __call__(self, exc_type, value, traceback):
        """Report an uncaught exception and exit with INFRA_FAILURE.

        @param exc_type: Class of the uncaught exception.
        @param value: The uncaught exception instance.
        @param traceback: Traceback object of the uncaught exception.
        """
        # Delegate to the interpreter's original hook first so that the
        # traceback still reaches stderr. Without this the failure would
        # leave no trace beyond the exit code (and the one-line JSON).
        sys.__excepthook__(exc_type, value, traceback)
        if self._should_dump_json:
            _dump_json({'return_message': ('Unhandled run_suite exception: %s'
                                           % value)})
        sys.exit(RETURN_CODES.INFRA_FAILURE)
2065
2066
def main():
    """Entry point.

    Parses and verifies the options, runs the pre-check when requested,
    runs the suite and, when options.json_dump is set, writes the result
    dict to stdout as JSON.

    @returns: The exit code of the script: 0 when options.do_nothing is
              set, otherwise the return code of the SuiteResult.
    """
    utils.verify_not_root_user()

    parser = make_parser()
    options = parser.parse_args()
    if options.do_nothing:
        return 0

    # Uncaught exceptions are dispatched through sys.excepthook. The name
    # assigned here before ("sys.exceptionhandler") is not an attribute the
    # interpreter reads, so the handler was never called.
    sys.excepthook = _ExceptionHandler(dump_json=options.json_dump)
    if options.json_dump:
        # Silence logging so that it does not mix with the JSON output.
        logging.disable(logging.CRITICAL)

    options_okay = verify_and_clean_options(options)
    # Set StreamHandler first to capture error messages if suite is not run.
    utils.setup_logging()
    if not options_okay:
        parser.print_help()
        result = SuiteResult(RETURN_CODES.INVALID_OPTIONS)
    elif options.pre_check and not _should_run(options):
        logging.info('Suite %s-%s is terminated: Lab is closed, OR build is '
                     'blocked, OR this suite has already been kicked off '
                     'once in past %d days.',
                     options.test_source_build, options.name,
                     _SEARCH_JOB_MAX_DAYS)
        result = SuiteResult(
            RETURN_CODES.ERROR,
            {'return_message': ("Lab is closed OR other reason"
                                " (see code, it's complicated)")})
    else:
        result = _run_task(options)

    if options.json_dump:
        _dump_json(result.output_dict)

    logging.info('Will return from run_suite with status: %s',
                  RETURN_CODES.get_string(result.return_code))
    return result.return_code
2105
2106
2107def _dump_json(obj):
2108    """Write obj JSON to stdout."""
2109    output_json = json.dumps(obj, sort_keys=True)
2110    sys.stdout.write('#JSON_START#%s#JSON_END#' % output_json.strip())
2111
2112
if __name__ == "__main__":
    # The value returned by main() becomes the exit status of the process.
    exit_status = main()
    sys.exit(exit_status)
2115