1#!/usr/bin/python
2#
3# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7
8"""Tool for running suites of tests and waiting for completion.
9
10The desired test suite will be scheduled with autotest. By default,
11this tool will block until the job is complete, printing a summary
12at the end.  Error conditions result in exceptions.
13
14This is intended for use only with Chrome OS test suites that leverage the
15dynamic suite infrastructure in server/cros/dynamic_suite.py.
16
17This script exits with one of the following codes:
180 - OK: Suite finished successfully
191 - ERROR: Test(s) failed, or hit their own timeout
202 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out.
213 - INFRA_FAILURE: Infrastructure related issues, e.g.
22    * Lab is down
23    * Too many DUTs (threshold defined as a constant) in repair failed status
24    * Suite job issues, like a bug in dynamic suite,
25      the user aborted the suite, loss of a drone/all devservers/RPC server,
26      0 tests ran, etc.
27    * provision failed
28      TODO(fdeng): crbug.com/413918, reexamine treating all provision
29                   failures as INFRA failures.
304 - SUITE_TIMEOUT: Suite timed out, some tests ran,
31    none failed by the time the suite job was aborted. This covers,
32    but is not limited to, the following cases:
33    * A devserver failure that manifests as a timeout
34    * No DUTs available midway through a suite
35    * Provision/Reset/Cleanup took longer than expected for a new image
36    * A regression in scheduler tick time.
375 - BOARD_NOT_AVAILABLE: There is no host for the requested board/pool.
386 - INVALID_OPTIONS: The given options are not valid.
39"""
40
41
42import datetime as datetime_base
43import ast, getpass, json, logging, optparse, os, re, sys, time
44from datetime import datetime
45
46import common
47from autotest_lib.client.common_lib import control_data
48from autotest_lib.client.common_lib import error
49from autotest_lib.client.common_lib import global_config, enum
50from autotest_lib.client.common_lib import priorities
51from autotest_lib.client.common_lib import time_utils
52from autotest_lib.client.common_lib.cros.graphite import autotest_stats
53from autotest_lib.client.common_lib.cros import retry
54from autotest_lib.frontend.afe.json_rpc import proxy
55from autotest_lib.server import utils
56from autotest_lib.server.cros import provision
57from autotest_lib.server.cros.dynamic_suite import constants
58from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
59from autotest_lib.server.cros.dynamic_suite import reporting_utils
60from autotest_lib.server.cros.dynamic_suite import tools
61from autotest_lib.site_utils import diagnosis_utils
62from autotest_lib.site_utils import job_overhead
63
64
65CONFIG = global_config.global_config
66
67WMATRIX_RETRY_URL = CONFIG.get_config_value('BUG_REPORTING',
68                                            'wmatrix_retry_url')
69
70# Return code that will be sent back to autotest_rpc_server.py
71RETURN_CODES = enum.Enum(
72        'OK', 'ERROR', 'WARNING', 'INFRA_FAILURE', 'SUITE_TIMEOUT',
73        'BOARD_NOT_AVAILABLE', 'INVALID_OPTIONS')
74# The severity of return code. If multiple codes
75# apply, the script should always return the severest one.
76# E.g. if we have a test failure and the suite also timed out,
77# we should return 'ERROR'.
78SEVERITY = {RETURN_CODES.OK: 0,
79            RETURN_CODES.WARNING: 1,
80            RETURN_CODES.SUITE_TIMEOUT: 2,
81            RETURN_CODES.INFRA_FAILURE: 3,
82            RETURN_CODES.ERROR: 4}
83ANDROID_BUILD_REGEX = r'.+/.+/[0-9]+'
84
85
86def get_worse_code(code1, code2):
87    """Compare the severity of two codes and return the worse code.
88
89    @param code1: An enum value of RETURN_CODES
90    @param code2: An enum value of RETURN_CODES
91
92    @returns: the more severe one between code1 and code2.
93
94    """
95    return code1 if SEVERITY[code1] >= SEVERITY[code2] else code2
96
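
# Illustrative sketch (not part of the original module): how a caller might
# fold several observed return codes into one exit code with get_worse_code().
# The function name below is hypothetical.
def _example_resolve_return_code():
    """Resolve several return codes to the severest one (illustration only)."""
    observed = [RETURN_CODES.WARNING, RETURN_CODES.SUITE_TIMEOUT,
                RETURN_CODES.ERROR]
    code = RETURN_CODES.OK
    for current in observed:
        code = get_worse_code(code, current)
    # SEVERITY ranks ERROR above SUITE_TIMEOUT and WARNING, so the resolved
    # code is RETURN_CODES.ERROR.
    return code
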
97
98def parse_options():
99    #pylint: disable-msg=C0111
100    usage = "usage: %prog [options]"
101    parser = optparse.OptionParser(usage=usage)
102    parser.add_option("-b", "--board", dest="board")
103    parser.add_option("-i", "--build", dest="build")
104    parser.add_option("-w", "--web", dest="web", default=None,
105                      help="Address of a webserver to receive suite requests.")
106    parser.add_option('--firmware_rw_build', dest='firmware_rw_build',
107                      default=None,
108                      help='Firmware build to be installed in dut RW firmware.')
109    parser.add_option('--firmware_ro_build', dest='firmware_ro_build',
110                      default=None,
111                      help='Firmware build to be installed in dut RO firmware.')
112    parser.add_option('--test_source_build', dest='test_source_build',
113                      default=None,
114                      help=('Build that contains the test code, '
115                            'e.g., it can be the value of `--build`, '
116                            '`--firmware_rw_build` or `--firmware_ro_build` '
117                            'arguments. Default is None, that is, use the test '
118                            'code from `--build` (CrOS image)'))
119    #  This should just be a boolean flag, but the autotest "proxy" code
120    #  can't handle flags that don't take arguments.
121    parser.add_option("-n", "--no_wait", dest="no_wait", default="False",
122                      help='Must pass "True" or "False" if used.')
123    # If you really want no pool, --pool="" will do it. USE WITH CARE.
124    parser.add_option("-p", "--pool", dest="pool", default="suites")
125    parser.add_option("-s", "--suite_name", dest="name")
126    parser.add_option("-a", "--afe_timeout_mins", type="int",
127                      dest="afe_timeout_mins", default=30)
128    parser.add_option("-t", "--timeout_mins", type="int",
129                      dest="timeout_mins", default=1440)
130    parser.add_option("-x", "--max_runtime_mins", type="int",
131                      dest="max_runtime_mins", default=1440)
132    parser.add_option("-d", "--delay_sec", type="int",
133                      dest="delay_sec", default=10)
134    parser.add_option("-m", "--mock_job_id", dest="mock_job_id",
135                      help="Attach to existing job id for already running "
136                           "suite, and create a report.")
137    # NOTE(akeshet): This looks similar to --no_wait, but behaves differently.
138    # --no_wait is passed in to the suite rpc itself and affects the suite,
139    # while this does not.
140    parser.add_option("-c", "--create_and_return", dest="create_and_return",
141                      action="store_true",
142                      help="Create the suite and print the job id, then "
143                           "finish immediately.")
144    parser.add_option("-u", "--num", dest="num", type="int", default=None,
145                      help="Run on at most NUM machines.")
146    #  Same boolean flag issue applies here.
147    parser.add_option("-f", "--file_bugs", dest="file_bugs", default='False',
148                      help='File bugs on test failures. Must pass "True" or '
149                           '"False" if used.')
150    parser.add_option("-l", "--bypass_labstatus", dest="bypass_labstatus",
151                      action="store_true", help='Bypass lab status check.')
152    # We allow either a number or a string for the priority.  This way, if you
153    # know what you're doing, you can specify a custom priority level between
154    # other levels.
155    parser.add_option("-r", "--priority", dest="priority",
156                      default=priorities.Priority.DEFAULT,
157                      action="store", help="Priority of suite")
158    parser.add_option('--retry', dest='retry', default='False',
159                      action='store', help='Enable test retry. '
160                      'Must pass "True" or "False" if used.')
161    parser.add_option('--max_retries', dest='max_retries', default=None,
162                      type='int', action='store', help='Maximum retries '
163                      'allowed at suite level. No limit if not specified.')
164    parser.add_option('--minimum_duts', dest='minimum_duts', type=int,
165                      default=0, action='store',
166                      help='Check that the pool has at least this many '
167                           'healthy machines, otherwise the suite will not run. '
168                           'Defaults to 0.')
169    parser.add_option('--suite_min_duts', dest='suite_min_duts', type=int,
170                      default=0, action='store',
171                      help='Preferred minimum number of machines. Scheduler '
172                           'will prioritize getting this many machines for '
173                           'the suite when it is competing with another suite '
174                           'that has a higher priority but already has the '
175                           'minimum machines it needs. Defaults to 0.')
176    parser.add_option("--suite_args", dest="suite_args",
177                      default=None, action="store",
178                      help="Argument string for suite control file.")
179    parser.add_option('--offload_failures_only', dest='offload_failures_only',
180                      action='store', default='False',
181                      help='Only enable gs_offloading for failed tests. '
182                           'Successful tests will be deleted. Must pass "True"'
183                           ' or "False" if used.')
184    parser.add_option('--use_suite_attr', dest='use_suite_attr',
185                      action='store_true', default=False,
186                      help='Advanced. Run the suite based on ATTRIBUTES of '
187                      'control files, rather than SUITE.')
188    parser.add_option('--json_dump', dest='json_dump', action='store_true',
189                      default=False,
190                      help='Dump the output of run_suite to stdout.')
191    parser.add_option('--run_prod_code', dest='run_prod_code',
192                      action='store_true', default=False,
193                      help='Run the test code that lives in prod aka the test '
194                           'code currently on the lab servers.')
195    options, args = parser.parse_args()
196    return parser, options, args
197
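
# Illustrative sketch (not part of the original module): several options above
# take the literal strings "True"/"False" because the autotest proxy cannot
# handle zero-argument flags, so a caller typically converts them to booleans
# after parsing.  The helper name below is hypothetical.
def _example_parse_and_convert_flags():
    """Parse options and convert the string-valued flags to booleans."""
    _, options, _ = parse_options()
    no_wait = (options.no_wait == 'True')
    file_bugs = (options.file_bugs == 'True')
    retry = (options.retry == 'True')
    return no_wait, file_bugs, retry
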
198
199def verify_options_and_args(options, args):
200    """Verify the validity of options and args.
201
202    @param options: The parsed options to verify.
203    @param args: The parsed args to verify.
204
205    @returns: True if verification passes, False otherwise.
206
207    """
208    if args:
209        print 'Unknown arguments: ' + str(args)
210        return False
211
212    if options.mock_job_id and (
213            not options.build or not options.name or not options.board):
214        print ('When using -m, you must specify the build, board and suite '
215               'name that were used to create the original job')
216        return False
217    else:
218        if not options.build:
219            print 'Need to specify which build to use'
220            return False
221        if not options.board:
222            print 'Need to specify board'
223            return False
224        if not options.name:
225            print 'Need to specify suite name'
226            return False
227    if options.num is not None and options.num < 1:
228        print 'Number of machines must be more than 0, if specified.'
229        return False
230    if options.no_wait != 'True' and options.no_wait != 'False':
231        print 'Please specify "True" or "False" for --no_wait.'
232        return False
233    if options.file_bugs != 'True' and options.file_bugs != 'False':
234        print 'Please specify "True" or "False" for --file_bugs.'
235        return False
236    if options.retry != 'True' and options.retry != 'False':
237        print 'Please specify "True" or "False" for --retry'
238        return False
239    if options.retry == 'False' and options.max_retries is not None:
240        print 'max_retries can only be used with --retry=True'
241        return False
242    if options.use_suite_attr and options.suite_args is not None:
243        print ('The new suite control file cannot parse the suite_args: %s. '
244               'Please do not specify any suite_args here.' % options.suite_args)
245        return False
246    if options.no_wait == 'True' and options.retry == 'True':
247        print 'Test retry is not available when using --no_wait=True'
248    # Default to use the test code in CrOS build.
249    if not options.test_source_build and options.build:
250        options.test_source_build = options.build
251    return True
252
253
254def change_options_for_suite_attr(options):
255    """Change options to be prepared to run the suite_attr_wrapper.
256
257    If 'use_suite_attr' is specified on the command line, the new-style suite
258    control file, suite_attr_wrapper, should be run. In that case, change
259    options.name to 'suite_attr_wrapper' and change options.suite_args to
260    include the arguments needed by suite_attr_wrapper.
261
262    @param options: The verified options.
263
264    @returns: The changed options.
265
266    """
267    # Convert the suite_name to attribute boolean expression.
268    if type(options.name) is str:
269        attr_filter_val = 'suite:%s' % options.name
270    else:
271        attr_filter_val = ' or '.join(['suite:%s' % x for x in options.name])
272
273    # Change suite_args to the string form of a dict holding the arguments
274    # needed by suite_attr_wrapper.
275    args_dict = {}
276    args_dict['attr_filter'] = attr_filter_val
277    options.suite_args = str(args_dict)
278    options.name = 'suite_attr_wrapper'
279
280    return options
281
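
# Illustrative sketch (not part of the original module): the rewrite performed
# by change_options_for_suite_attr for a plain suite name.  The option values
# and the helper name are hypothetical.
def _example_change_options_for_suite_attr():
    """Show how a suite name is rewritten for suite_attr_wrapper."""
    options = optparse.Values({'name': 'bvt', 'suite_args': None})
    options = change_options_for_suite_attr(options)
    # options.name is now 'suite_attr_wrapper' and options.suite_args is the
    # string "{'attr_filter': 'suite:bvt'}".
    return options
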
282
283def get_pretty_status(status):
284    """
285    Converts a status string into a pretty-for-printing string.
286
287    @param status: Status to convert.
288
289    @return: Returns pretty string.
290             GOOD    -> [ PASSED ]
291             TEST_NA -> [ INFO ]
292             other   -> [ FAILED ]
293    """
294    if status == 'GOOD':
295        return '[ PASSED ]'
296    elif status == 'TEST_NA':
297        return '[  INFO  ]'
298    return '[ FAILED ]'
299
300
301def get_original_suite_name(suite_name, suite_args):
302    """Get the original suite name when running suite_attr_wrapper.
303
304    @param suite_name: the name of the suite launched in the afe. When it is
305                       suite_attr_wrapper, the suite that is actually running
306                       is specified in suite_args.
307    @param suite_args: the suite_args string containing the original suite name.
308
309    @returns: the original suite name.
310
311    """
312    if suite_name == 'suite_attr_wrapper':
313        attrs = ast.literal_eval(suite_args).get('attr_filter', '')
314        suite_list = ([x[6:] for x in re.split('[() ]', attrs)
315                       if x and x.startswith('suite:')])
316        return suite_list[0] if suite_list else suite_name
317    return suite_name
318
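
# Illustrative sketch (not part of the original module): recovering the
# original suite name from suite_attr_wrapper arguments.  The values and the
# helper name are hypothetical.
def _example_get_original_suite_name():
    """Show the round trip back to the original suite name."""
    wrapped_args = str({'attr_filter': 'suite:bvt'})
    # Returns 'bvt'; for any other suite name the name is returned unchanged.
    return get_original_suite_name('suite_attr_wrapper', wrapped_args)
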
319
320def GetBuildbotStepLink(anchor_text, url):
321    """Generate a buildbot formatted link.
322
323    @param anchor_text    The link text.
324    @param url            The url to link to.
325    """
326    return '@@@STEP_LINK@%s@%s@@@' % (anchor_text, url)
327
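
# For illustration (hypothetical values): GetBuildbotStepLink('results',
# 'http://server/job/1') produces the buildbot annotator string
# '@@@STEP_LINK@results@http://server/job/1@@@'.
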
328
329class LogLink(object):
330    """Information needed to record a link in the logs.
331
332    Depending on context and the information provided at
333    construction time, the link may point either to log files for
334    a job, or to a bug filed for a failure in the job.
335
336    @var anchor  The link text.
337    @var url     The link url.
338    @var bug_id  Id of a bug to link to, or None.
339    """
340
341    _BUG_URL_PREFIX = CONFIG.get_config_value('BUG_REPORTING',
342                                              'tracker_url')
343    _URL_PATTERN = CONFIG.get_config_value('CROS',
344                                           'log_url_pattern', type=str)
345
346
347    @classmethod
348    def get_bug_link(cls, bug_id):
349        """Generate a bug link for the given bug_id.
350
351        @param bug_id: The id of the bug.
352        @return: A link, eg: https://crbug.com/<bug_id>.
353        """
354        return '%s%s' % (cls._BUG_URL_PREFIX, bug_id)
355
356
357    def __init__(self, anchor, server, job_string, bug_info=None, reason=None,
358                 retry_count=0, testname=None):
359        """Initialize the LogLink by generating the log URL.
360
361        @param anchor      The link text.
362        @param server      The hostname of the server this suite ran on.
363        @param job_string  The job whose logs we'd like to link to.
364        @param bug_info    Info about the bug, if one was filed.
365        @param reason      A string representing the reason of failure if any.
366        @param retry_count How many times the test has been retried.
367        @param testname    Optional Arg that supplies the testname.
368        """
369        self.anchor = anchor
370        self.url = self._URL_PATTERN % (server, job_string)
371        self.reason = reason
372        self.retry_count = retry_count
373        self.testname = testname
374        if bug_info:
375            self.bug_id, self.bug_count = bug_info
376        else:
377            self.bug_id = None
378            self.bug_count = None
379
380
381    def GenerateBuildbotLink(self):
382        """Generate a link formatted to meet buildbot expectations.
383
384        If there is a bug associated with this link, report that;
385        otherwise report a link to the job logs.
386
387        @return A link formatted for the buildbot log annotator.
388        """
389        info_strings = []
390        if self.retry_count > 0:
391            info_strings.append('retry_count: %d' % self.retry_count)
392
393        if self.bug_id:
394            url = self.get_bug_link(self.bug_id)
395            if self.bug_count is None:
396                bug_info = 'unknown number of reports'
397            elif self.bug_count == 1:
398                bug_info = 'new report'
399            else:
400                bug_info = '%s reports' % self.bug_count
401            info_strings.append(bug_info)
402        else:
403            url = self.url
404
405        if self.reason:
406            info_strings.append(self.reason.strip())
407
408        if info_strings:
409            info = ', '.join(info_strings)
410            anchor_text = '%(anchor)s: %(info)s' % {
411                    'anchor': self.anchor.strip(), 'info': info}
412        else:
413            anchor_text = self.anchor.strip()
414
415        return GetBuildbotStepLink(anchor_text, url)
416
417
418    def GenerateTextLink(self):
419        """Generate a link to the job's logs, for consumption by a human.
420
421        @return A link formatted for human readability.
422        """
423        return '%s%s' % (self.anchor, self.url)
424
425
426    def GenerateWmatrixRetryLink(self):
427        """Generate a link to the wmatrix retry dashboard.
428
429        @return A link formatted for the buildbot log annotator.
430        """
431        if not self.testname:
432            return None
433
434        return GetBuildbotStepLink(
435                'Flaky test dashboard view for test %s' %
436                self.testname, WMATRIX_RETRY_URL % self.testname)
437
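
# Illustrative sketch (not part of the original module): constructing a
# LogLink and emitting its buildbot annotation.  The server, job string, bug
# info and test name below are hypothetical.
def _example_log_link():
    """Build a LogLink for a retried, bug-annotated test failure."""
    link = LogLink(anchor='dummy_Fail', server='cautotest',
                   job_string='12345-someuser', bug_info=(987654, 3),
                   reason='FAIL: something broke', retry_count=1,
                   testname='dummy_Fail')
    # With a bug attached, GenerateBuildbotLink() links to the bug tracker and
    # annotates the text with 'retry_count: 1, 3 reports, FAIL: ...'.
    return link.GenerateBuildbotLink(), link.GenerateWmatrixRetryLink()
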
438
439class Timings(object):
440    """Timings for important events during a suite.
441
442    All timestamps are datetime.datetime objects.
443
444    @var suite_job_id: the afe job id of the suite job for which
445                       we are recording the timing.
446    @var download_start_time: the time the devserver starts staging
447                              the build artifacts. Recorded in create_suite_job.
448    @var payload_end_time: the time when the artifacts necessary to start
449                           installing images onto DUTs are staged.
450                           Recorded in create_suite_job.
451    @var artifact_end_time: the remaining artifacts are downloaded after we kick
452                            off the reimaging job, at which point we record
453                            artifact_end_time. Recorded in dynamic_suite.py.
454    @var suite_start_time: the time the suite started.
455    @var tests_start_time: the time the first test started running.
456    @var tests_end_time: the time the last test finished running.
457    """
458
459    def __init__(self, suite_job_id):
460        self.suite_job_id = suite_job_id
461        # Timings related to staging artifacts on devserver.
462        self.download_start_time = None
463        self.payload_end_time = None
464        self.artifact_end_time = None
465
466        # The test_started_time, but taken from the view that corresponds to
467        # the suite instead of an individual test.
468        self.suite_start_time = None
469
470        # Earliest start and latest end times among the TestViews passed to us.
471        self.tests_start_time = None
472        self.tests_end_time = None
473
474
475    def RecordTiming(self, view):
476        """Given a test report view, extract and record pertinent time info.
477
478        get_detailed_test_views() returns a list of entries that provide
479        info about the various parts of a suite run.  This method can take
480        any one of these entries and look up timestamp info we might want
481        and record it.
482
483        If timestamps are unavailable, datetime.datetime.min/max will be used.
484
485        @param view: A TestView object.
486        """
487        start_candidate = datetime.min
488        end_candidate = datetime.max
489        if view['test_started_time']:
490            start_candidate = time_utils.time_string_to_datetime(
491                    view['test_started_time'])
492        if view['test_finished_time']:
493            end_candidate = time_utils.time_string_to_datetime(
494                    view['test_finished_time'])
495
496        if view.get_testname() == TestView.SUITE_PREP:
497            self.suite_start_time = start_candidate
498        else:
499            self._UpdateFirstTestStartTime(start_candidate)
500            self._UpdateLastTestEndTime(end_candidate)
501        if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view:
502            keyvals = view['job_keyvals']
503            self.download_start_time = time_utils.time_string_to_datetime(
504                    keyvals.get(constants.DOWNLOAD_STARTED_TIME),
505                    handle_type_error=True)
506
507            self.payload_end_time = time_utils.time_string_to_datetime(
508                    keyvals.get(constants.PAYLOAD_FINISHED_TIME),
509                    handle_type_error=True)
510
511            self.artifact_end_time = time_utils.time_string_to_datetime(
512                    keyvals.get(constants.ARTIFACT_FINISHED_TIME),
513                    handle_type_error=True)
514
515
516    def _UpdateFirstTestStartTime(self, candidate):
517        """Update self.tests_start_time, iff candidate is an earlier time.
518
519        @param candidate: a datetime.datetime object.
520        """
521        if not self.tests_start_time or candidate < self.tests_start_time:
522            self.tests_start_time = candidate
523
524
525    def _UpdateLastTestEndTime(self, candidate):
526        """Update self.tests_end_time, iff candidate is a later time.
527
528        @param candidate: a datetime.datetime object.
529        """
530        if not self.tests_end_time or candidate > self.tests_end_time:
531            self.tests_end_time = candidate
532
533
534    def __str__(self):
535        return ('\n'
536                'Suite timings:\n'
537                'Downloads started at %s\n'
538                'Payload downloads ended at %s\n'
539                'Suite started at %s\n'
540                'Artifact downloads ended (at latest) at %s\n'
541                'Testing started at %s\n'
542                'Testing ended at %s\n' % (self.download_start_time,
543                                           self.payload_end_time,
544                                           self.suite_start_time,
545                                           self.artifact_end_time,
546                                           self.tests_start_time,
547                                           self.tests_end_time))
548
549
550    def SendResultsToStatsd(self, suite, build, board):
551        """
552        Sends data to statsd.
553
554        1. Makes a data_key of the form: run_suite.$board.$branch.$suite
555            eg: stats/gauges/<hostname>/run_suite/<board>/<branch>/<suite>/
556        2. Computes timings for several start and end event pairs.
557        3. Sends all timing values to statsd.
558
559        @param suite: scheduled suite that we want to record the results of.
560        @param build: the build that this suite ran on.
561                      eg: 'lumpy-release/R26-3570.0.0'
562        @param board: the board that this suite ran on.
563        """
564        if sys.version_info < (2, 7):
565            logging.error('Sending run_suite perf data to statsd requires '
566                          'python 2.7 or greater.')
567            return
568
569        # Constructs the key used for logging statsd timing data.
570        data_key = utils.get_data_key('run_suite', suite, build, board)
571
572        # Since we don't want to try subtracting corrupted datetime values,
573        # we catch TypeErrors in time_utils.time_string_to_datetime and insert
574        # None instead. This means that even if, say,
575        # keyvals.get(constants.ARTIFACT_FINISHED_TIME) returns a corrupt
576        # value the member artifact_end_time is set to None.
577        if self.download_start_time:
578            if self.payload_end_time:
579                autotest_stats.Timer(data_key).send('payload_download_time',
580                        (self.payload_end_time -
581                         self.download_start_time).total_seconds())
582
583            if self.artifact_end_time:
584                autotest_stats.Timer(data_key).send('artifact_download_time',
585                        (self.artifact_end_time -
586                         self.download_start_time).total_seconds())
587
588        if self.tests_end_time:
589            if self.suite_start_time:
590                autotest_stats.Timer(data_key).send('suite_run_time',
591                        (self.tests_end_time -
592                         self.suite_start_time).total_seconds())
593
594            if self.tests_start_time:
595                autotest_stats.Timer(data_key).send('tests_run_time',
596                        (self.tests_end_time -
597                         self.tests_start_time).total_seconds())
598
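
# For illustration (hypothetical values): calling SendResultsToStatsd('bvt',
# 'lumpy-release/R26-3570.0.0', 'lumpy') on a populated Timings object sends
# timer values such as 'suite_run_time' and 'tests_run_time' under a key of
# the form run_suite.<board>.<branch>.<suite>, as built by utils.get_data_key.
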
599
600_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value(
601        'SERVER', 'hostname', type=str)
602
603
604def instance_for_pool(pool_name):
605    """
606    Return the hostname of the server that should be used to service a suite
607    for the specified pool.
608
609    @param pool_name: The pool (without 'pool:') to schedule the suite against.
610    @return: The correct host that should be used to service this suite run.
611    """
612    return CONFIG.get_config_value(
613            'POOL_INSTANCE_SHARDING', pool_name,
614            default=_DEFAULT_AUTOTEST_INSTANCE)
615
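
# For illustration (hypothetical config values): if the POOL_INSTANCE_SHARDING
# section maps 'performance' to 'chromeos-server2', instance_for_pool(
# 'performance') returns 'chromeos-server2', while an unlisted pool such as
# 'suites' falls back to _DEFAULT_AUTOTEST_INSTANCE.
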
616
617class TestView(object):
618    """Represents a test view and provides a set of helper functions."""
619
620
621    SUITE_PREP = 'Suite prep'
622    INFRA_TESTS = ['provision']
623
624
625    def __init__(self, view, afe_job, suite_name, build, user,
626                 solo_test_run=False):
627        """Init a TestView object representing a tko test view.
628
629        @param view: A dictionary representing a tko test view.
630        @param afe_job: An instance of frontend.afe.models.Job
631                        representing the job that kicked off the test.
632        @param suite_name: The name of the suite
633                           that the test belongs to.
634        @param build: The build for which the test is run.
635        @param user: The user for which the test is run.
636        @param solo_test_run: This is a solo test run not part of a suite.
637        """
638        self.view = view
639        self.afe_job = afe_job
640        self.suite_name = suite_name
641        self.build = build
642        self.is_suite_view = afe_job.parent_job is None and not solo_test_run
643        # This is the test name that will be shown in the output.
644        self.testname = None
645        self.user = user
646
647        # A job that was aborted before it got a chance to run usually
648        # indicates the suite has timed out (unless aborted by the user).
649        # In this case, the abort reason will be None.
650        # Update the reason with proper information.
651        if (self.is_relevant_suite_view() and
652                not self.get_testname() == self.SUITE_PREP and
653                self.view['status'] == 'ABORT' and
654                not self.view['reason']):
655            self.view['reason'] = 'Timed out, did not run.'
656
657
658    def __getitem__(self, key):
659        """Overload __getitem__ so that we can still use []
660
661        @param key: A key of the tko test view.
662
663        @returns: The value of an attribute in the view.
664
665        """
666        return self.view[key]
667
668
669    def __iter__(self):
670        """Overload __iter__ so that it supports 'in' operator."""
671        return iter(self.view)
672
673
674    def get_testname(self):
675        """Get test name that should be shown in the output.
676
677        Formalize the test_name we got from the test view.
678
679        Remove the 'build/suite' prefix if any, and prepend the 'experimental'
680        prefix to experimental tests whose names do not already start with it.
681
682        If one runs a test in control file via the following code,
683           job.runtest('my_Test', tag='tag')
684        for most of the cases, view['test_name'] would look like 'my_Test.tag'.
685        If this is the case, this method will just return the original
686        test name, i.e. 'my_Test.tag'.
687
688        There are four special cases.
689        1) A test view is for the suite job's SERVER_JOB.
690           In this case, this method will return 'Suite prep'.
691
692        2) A test view is of a child job or a solo test run not part of a
693           suite, and for a SERVER_JOB or CLIENT_JOB.
694           In this case, we will take the job name, remove the build/suite
695           prefix from the job name, and prepend the rest to 'SERVER_JOB'
696           or 'CLIENT_JOB'. So the names returned by this
697           method will look like:
698             'experimental_Telemetry Smoothness Measurement_SERVER_JOB'
699             'experimental_dummy_Pass_SERVER_JOB'
700             'dummy_Fail_SERVER_JOB'
701
702        3) A test view is of a suite job and its status is ABORT.
703           In this case, the view['test_name'] is the child job's name.
704           If it is an experimental test, 'experimental' will be part
705           of the name. For instance,
706             'lumpy-release/R35-5712.0.0/perf_v2/
707                   experimental_Telemetry Smoothness Measurement'
708             'lumpy-release/R35-5712.0.0/dummy/experimental_dummy_Pass'
709             'lumpy-release/R35-5712.0.0/dummy/dummy_Fail'
710           The above names will be converted to the following:
711             'experimental_Telemetry Smoothness Measurement'
712             'experimental_dummy_Pass'
713             'dummy_Fail'
714
715        4) A test view is of a suite job and its status is TEST_NA.
716           In this case, the view['test_name'] is the NAME field of the control
717           file. If it is an experimental test, 'experimental' will be part of
718           the name. For instance,
719             'experimental_Telemetry Smoothness Measurement'
720             'experimental_dummy_Pass'
721             'dummy_Fail'
722           This method will not modify these names.
723
724        @returns: Test name after normalization.
725
726        """
727        if self.testname is not None:
728            return self.testname
729
730        if (self.is_suite_view and
731                self.view['test_name'].startswith('SERVER_JOB')):
732            # Rename suite job's SERVER_JOB to 'Suite prep'.
733            self.testname = self.SUITE_PREP
734            return self.testname
735
736        if (self.view['test_name'].startswith('SERVER_JOB') or
737                self.view['test_name'].startswith('CLIENT_JOB')):
738            # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
739            testname = '%s_%s' % (self.view['job_name'], self.view['test_name'])
740        else:
741            testname = self.view['test_name']
742        experimental = self.is_experimental()
743        # Remove the build and suite name from testname if any.
744        testname = tools.get_test_name(
745                self.build, self.suite_name, testname)
746        # If an experimental test was aborted, testname
747        # would include the 'experimental' prefix already.
748        prefix = constants.EXPERIMENTAL_PREFIX if (
749                experimental and not
750                testname.startswith(constants.EXPERIMENTAL_PREFIX)) else ''
751        self.testname = prefix + testname
752        return self.testname
753
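
    # Illustrative summary of the normalization performed by get_testname()
    # (values taken from the docstring examples above):
    #   suite job SERVER_JOB                           -> 'Suite prep'
    #   SERVER_JOB of child job '.../dummy/dummy_Fail' -> 'dummy_Fail_SERVER_JOB'
    #   aborted suite view '.../dummy/dummy_Fail'      -> 'dummy_Fail'
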
754
755    def is_relevant_suite_view(self):
756        """Checks whether this is a suite view we should care about.
757
758        @returns: True if it is relevant. False otherwise.
759        """
760        return (self.get_testname() == self.SUITE_PREP or
761                (self.is_suite_view and
762                    not self.view['test_name'].startswith('CLIENT_JOB') and
763                    not self.view['subdir']))
764
765
766    def is_test(self):
767        """Return whether the view is for an actual test.
768
769        @returns True if the view is for an actual test.
770                 False if the view is for SERVER_JOB or CLIENT_JOB.
771
772        """
773        return not (self.view['test_name'].startswith('SERVER_JOB') or
774                self.view['test_name'].startswith('CLIENT_JOB'))
775
776
777    def is_retry(self):
778        """Check whether the view is for a retry.
779
780        @returns: True, if the view is for a retry; False otherwise.
781
782        """
783        return self.view['job_keyvals'].get('retry_original_job_id') is not None
784
785
786    def is_experimental(self):
787        """Check whether a test view is for an experimental test.
788
789        @returns: True if it is for an experimental test, False otherwise.
790
791        """
792        return (self.view['job_keyvals'].get('experimental') == 'True' or
793                tools.get_test_name(self.build, self.suite_name,
794                        self.view['test_name']).startswith('experimental'))
795
796
797    def hit_timeout(self):
798        """Check whether the corresponding job has hit its own timeout.
799
800        Note this method should not be called for test views that
801        belong to a suite job and are determined to be irrelevant
802        by is_relevant_suite_view.  This is because they are associated
803        with the suite job, whose job start/finished times make no sense
804        for an irrelevant test view.
805
806        @returns: True if the corresponding afe job has hit timeout.
807                  False otherwise.
808        """
809        if (self.is_relevant_suite_view() and
810                self.get_testname() != self.SUITE_PREP):
811            # Any relevant suite test view except SUITE_PREP
812            # did not hit its own timeout because it was not ever run.
813            return False
814        start = (datetime.strptime(
815                self.view['job_started_time'], time_utils.TIME_FMT)
816                if self.view['job_started_time'] else None)
817        end = (datetime.strptime(
818                self.view['job_finished_time'], time_utils.TIME_FMT)
819                if self.view['job_finished_time'] else None)
820        if not start or not end:
821            return False
822        else:
823            return ((end - start).total_seconds()/60.0
824                        > self.afe_job.max_runtime_mins)
825
826
827    def is_aborted(self):
828        """Check if the view was aborted.
829
830        For suite prep and child job test views, we check job keyval
831        'aborted_by' and test status.
832
833        For relevant suite job test views, we only check test status
834        because the suite job keyval won't make sense to individual
835        test views.
836
837        @returns: True if the test was aborted, False otherwise.
838
839        """
840
841        if (self.is_relevant_suite_view() and
842                self.get_testname() != self.SUITE_PREP):
843            return self.view['status'] == 'ABORT'
844        else:
845            return (bool(self.view['job_keyvals'].get('aborted_by')) and
846                    self.view['status'] in ['ABORT', 'RUNNING'])
847
848
849    def is_in_fail_status(self):
850        """Check if the given test's status corresponds to a failure.
851
852        @returns: True if the test's status is FAIL or ERROR. False otherwise.
853
854        """
855        # All the statuses tests can have when they fail.
856        return self.view['status'] in ['FAIL', 'ERROR', 'ABORT']
857
858
859    def is_infra_test(self):
860        """Check whether this is a test that only lab infra is concerned.
861
862        @returns: True if only lab infra is concerned, False otherwise.
863
864        """
865        return self.get_testname() in self.INFRA_TESTS
866
867
868    def get_buildbot_link_reason(self):
869        """Generate the buildbot link reason for the test.
870
871        @returns: A string representing the reason.
872
873        """
874        return ('%s: %s' % (self.view['status'], self.view['reason'])
875                if self.view['reason'] else self.view['status'])
876
877
878    def get_job_id_owner_str(self):
879        """Generate the job_id_owner string for a test.
880
881        @returns: A string which looks like 135036-username
882
883        """
884        return '%s-%s' % (self.view['afe_job_id'], self.user)
885
886
887    def get_bug_info(self, suite_job_keyvals):
888        """Get the bug info from suite_job_keyvals.
889
890        If a bug has been filed for the test, its bug info (bug id and counts)
891        will be stored in the suite job's keyvals. This method attempts to
892        retrieve bug info of the test from |suite_job_keyvals|. It will return
893        None if no bug info is found. No need to check bug info if the view is
894        SUITE_PREP.
895
896        @param suite_job_keyvals: The job keyval dictionary of the suite job.
897                All the bug info about child jobs are stored in
898                suite job's keyvals.
899
900        @returns: None if there is no bug info, or a pair with the
901                  id of the bug, and the count of the number of
902                  times the bug has been seen.
903
904        """
905        if self.get_testname() == self.SUITE_PREP:
906            return None
907        if (self.view['test_name'].startswith('SERVER_JOB') or
908                self.view['test_name'].startswith('CLIENT_JOB')):
909            # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
910            testname = '%s_%s' % (self.view['job_name'], self.view['test_name'])
911        else:
912            testname = self.view['test_name']
913
914        return tools.get_test_failure_bug_info(
915                suite_job_keyvals, self.view['afe_job_id'],
916                testname)
917
918
919    def should_display_buildbot_link(self):
920        """Check whether a buildbot link should show for this view.
921
922        For the suite prep view, show a buildbot link if it fails.
923        For a normal test view, show a buildbot link if
924            it is a retry,
925            it hit its own timeout, or
926            it failed. This doesn't
927            include the case where it was aborted but has
928            not hit its own timeout (most likely it was aborted because
929            the suite has timed out).
930
931        @returns: True if we should show the buildbot link.
932                  False otherwise.
933        """
934        is_bad_status = (self.view['status'] != 'GOOD' and
935                         self.view['status'] != 'TEST_NA')
936        if self.get_testname() == self.SUITE_PREP:
937            return is_bad_status
938        else:
939            if self.is_retry():
940                return True
941            if is_bad_status:
942                return not self.is_aborted() or self.hit_timeout()
943
944
945    def get_control_file_attributes(self):
946        """Get the attributes from the control file of the test.
947
948        @returns: A list of test attribute or None.
949        """
950        control_file = self.afe_job.control_file
951        attributes = None
952        if control_file:
953            cd = control_data.parse_control_string(control_file)
954            attributes = list(cd.attributes)
955        return attributes
956
957
958class ResultCollector(object):
959    """Collect test results of a suite or a single test run.
960
961    Once a suite job has finished, use this class to collect test results.
962    `run` is the core method that is to be called first. Then the caller
963    can retrieve information like return code, return message, is_aborted,
964    and timings by accessing the collector's public attributes, and output
965    the test results and links by calling the 'output_*' methods.
966
967    Here is an overview of what the `run` method does.
968
969    1) Collect the suite job's results from tko_test_view_2.
970    For the suite job, we only pull test views without a 'subdir'.
971    A NULL subdir indicates that the test was _not_ executed. This could be
972    that no child job was scheduled for this test or the child job got
973    aborted before starts running.
974    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
975
976    2) Collect the child jobs' results from tko_test_view_2.
977    For child jobs, we pull all the test views associated with them.
978    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
979
980    3) Generate web and buildbot links.
981    4) Compute timings of the suite run.
982    5) Compute the return code based on test results.
983
984    @var _instance_server: The hostname of the server that is used
985                           to service the suite.
986    @var _afe: The afe rpc client.
987    @var _tko: The tko rpc client.
988    @var _build: The build for which the suite is run,
989                 e.g. 'lumpy-release/R35-5712.0.0'
990    @var _board: The target board for which the suite is run,
991                 e.g., 'lumpy', 'link'.
992    @var _suite_name: The suite name, e.g. 'bvt', 'dummy'.
993    @var _suite_job_id: The job id of the suite for which we are going to
994                        collect results.
995    @var _original_suite_name: The suite name used to record timings; it can
996                               differ from _suite_name when running
997                               suite_attr_wrapper.
998    @var _suite_views: A list of TestView objects, representing relevant
999                       test views of the suite job.
1000    @var _child_views: A list of TestView objects, representing test views
1001                       of the child jobs.
1002    @var _test_views: A list of TestView objects, representing all test views
1003                      from _suite_views and _child_views.
1004    @var _web_links: A list of web links pointing to the results of jobs.
1005    @var _buildbot_links: A list of buildbot links for non-passing tests.
1006    @var _max_testname_width: Max width of all test names.
1007    @var _solo_test_run: True if this is a single test run.
1008    @var return_code: The exit code that should be returned by run_suite.
1009    @var return_message: Any message that should be displayed to explain
1010                         the return code.
1011    @var is_aborted: Whether the suite was aborted or not.
1012                     True, False or None (aborting status is unknown yet)
1013    @var timings: A Timings object that records the suite's timings.
1014
1015    """
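
    # Illustrative usage (hypothetical values; afe and tko are assumed to be
    # RPC clients created via frontend_wrappers by the caller):
    #
    #     collector = ResultCollector('cautotest', afe, tko,
    #                                 'lumpy-release/R35-5712.0.0', 'lumpy',
    #                                 'dummy', suite_job_id=12345)
    #     collector.run()
    #     collector.output_results()
    #     sys.exit(collector.return_code)
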
1016
1017
1018    def __init__(self, instance_server, afe, tko, build, board,
1019                 suite_name, suite_job_id, original_suite_name=None,
1020                 user=None, solo_test_run=False):
1021        self._instance_server = instance_server
1022        self._afe = afe
1023        self._tko = tko
1024        self._build = build
1025        self._board = board
1026        self._suite_name = suite_name
1027        self._suite_job_id = suite_job_id
1028        self._original_suite_name = original_suite_name or suite_name
1029        self._suite_views = []
1030        self._child_views = []
1031        self._test_views = []
1032        self._retry_counts = {}
1033        self._web_links = []
1034        self._buildbot_links = []
1035        self._max_testname_width = 0
1036        self._num_child_jobs = 0
1037        self.return_code = None
1038        self.return_message = ''
1039        self.is_aborted = None
1040        self.timings = None
1041        self._user = user or getpass.getuser()
1042        self._solo_test_run = solo_test_run
1043
1044
1045    def _fetch_relevant_test_views_of_suite(self):
1046        """Fetch relevant test views of the suite job.
1047
1048        For the suite job, there will be a test view for SERVER_JOB, and views
1049        for results of its child jobs. For example, assume we've ceated
1050        a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail,
1051        dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while
1052        dummy_Path.bluetooth got TEST_NA as no duts have bluetooth.
1053        So the suite job's test views would look like
1054        _____________________________________________________________________
1055        test_idx| job_idx|test_name           |subdir      |afe_job_id|status
1056        10      | 1000   |SERVER_JOB          |----        |40        |GOOD
1057        11      | 1000   |dummy_Pass          |NULL        |40        |ABORT
1058        12      | 1000   |dummy_Fail.Fail     |41-owner/...|40        |FAIL
1059        13      | 1000   |dummy_Fail.Error    |42-owner/...|40        |ERROR
1060        14      | 1000   |dummy_Pass.bluetooth|NULL        |40        |TEST_NA
1061
1062        For a suite job, we only care about
1063        a) The test view for the suite job's SERVER_JOB
1064        b) The test views for real tests without a subdir. A NULL subdir
1065           indicates that a test didn't get executed.
1066        So, for the above example, we only keep test views whose test_idxs
1067        are 10, 11, 14.
1068
1069        @returns: A list of TestView objects, representing relevant
1070                  test views of the suite job.
1071
1072        """
1073        suite_job = self._afe.get_jobs(id=self._suite_job_id)[0]
1074        views = self._tko.run(call='get_detailed_test_views',
1075                              afe_job_id=self._suite_job_id)
1076        relevant_views = []
1077        for v in views:
1078            v = TestView(v, suite_job, self._suite_name, self._build, self._user,
1079                         solo_test_run=self._solo_test_run)
1080            if v.is_relevant_suite_view():
1081                relevant_views.append(v)
1082        return relevant_views
1083
1084
1085    def _compute_retry_count(self, view):
1086        """Return how many times the test has been retried.
1087
1088        @param view: A TestView instance.
1089        @returns: An int value indicating the retry count.
1090
1091        """
1092        old_job = view['job_keyvals'].get('retry_original_job_id')
1093        count = 0
1094        while old_job:
1095            count += 1
1096            views = self._tko.run(
1097                call='get_detailed_test_views', afe_job_id=old_job)
1098            old_job = (views[0]['job_keyvals'].get('retry_original_job_id')
1099                       if views else None)
1100        return count
1101
1102
1103    def _fetch_test_views_of_child_jobs(self, jobs=None):
1104        """Fetch test views of child jobs.
1105
1106        @returns: A tuple (child_views, retry_counts)
1107                  child_views is list of TestView objects, representing
1108                  all valid views. retry_counts is a dictionary that maps
1109                  test_idx to retry counts. It only stores retry
1110                  counts that are greater than 0.
1111
1112        """
1113        child_views = []
1114        retry_counts = {}
1115        child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id)
1116        if child_jobs:
1117            self._num_child_jobs = len(child_jobs)
1118        for job in child_jobs:
1119            views = [TestView(v, job, self._suite_name, self._build, self._user)
1120                     for v in self._tko.run(
1121                         call='get_detailed_test_views', afe_job_id=job.id,
1122                         invalid=0)]
1123            contains_test_failure = any(
1124                    v.is_test() and v['status'] != 'GOOD' for v in views)
1125            for v in views:
1126                if (v.is_test() or
1127                        (v['status'] != 'GOOD' and not contains_test_failure)):
1128                    # For normal test view, just keep it.
1129                    # For SERVER_JOB or CLIENT_JOB, only keep it
1130                    # if it fails and no other test failure.
1131                    child_views.append(v)
1132                    retry_count = self._compute_retry_count(v)
1133                    if retry_count > 0:
1134                        retry_counts[v['test_idx']] = retry_count
1135        return child_views, retry_counts
1136
1137
1138    def _generate_web_and_buildbot_links(self):
1139        """Generate web links and buildbot links."""
1140        # TODO(fdeng): If a job was aborted before it reaches Running
1141        # state, we read the test view from the suite job
1142        # and thus this method generates a link pointing to the
1143        # suite job's page for the aborted job. Need a fix.
1144        self._web_links = []
1145        self._buildbot_links = []
1146        # Bug info is stored in the suite job's keyvals.
1147        if self._solo_test_run:
1148            suite_job_keyvals = {}
1149        else:
1150            suite_job_keyvals = self._suite_views[0]['job_keyvals']
1151        for v in self._test_views:
1152            retry_count = self._retry_counts.get(v['test_idx'], 0)
1153            bug_info = v.get_bug_info(suite_job_keyvals)
1154            job_id_owner = v.get_job_id_owner_str()
1155            link = LogLink(
1156                    anchor=v.get_testname().ljust(
1157                            self._max_testname_width),
1158                    server=self._instance_server,
1159                    job_string=job_id_owner,
1160                    bug_info=bug_info, retry_count=retry_count,
1161                    testname=v.get_testname())
1162            self._web_links.append(link)
1163
1164            if v.should_display_buildbot_link():
1165                link.reason = v.get_buildbot_link_reason()
1166                self._buildbot_links.append(link)
1167
1168
1169    def _record_timings(self):
1170        """Record suite timings."""
1171        self.timings = Timings(self._suite_job_id)
1172        for v in self._test_views:
1173            self.timings.RecordTiming(v)
1174
1175
1176    def _get_return_msg(self, code, tests_passed_after_retry):
1177        """Return the proper message for a given return code.
1178
1179        @param code: An enum value of RETURN_CODES
1180        @param tests_passed_after_retry: True/False, indicating
1181            whether there are test(s) that have passed after retry.
1182
1183        @returns: A string, representing the message.
1184
1185        """
1186        if code == RETURN_CODES.INFRA_FAILURE:
1187            return 'Suite job failed or provisioning failed.'
1188        elif code == RETURN_CODES.SUITE_TIMEOUT:
1189            return ('Some test(s) were aborted before running;'
1190                    ' the suite must have timed out.')
1191        elif code == RETURN_CODES.WARNING:
1192            if tests_passed_after_retry:
1193                return 'Some test(s) passed after retry.'
1194            else:
1195                return 'Some test(s) raised a warning.'
1196        elif code == RETURN_CODES.ERROR:
1197            return 'Some test(s) failed.'
1198        else:
1199            return ''
1200
1201
1202    def _compute_return_code(self):
1203        """Compute the exit code based on test results."""
1204        code = RETURN_CODES.OK
1205        tests_passed_after_retry = False
1206
1207        for v in self._test_views:
1208            # The order of checking each case is important.
1209            if v.is_experimental():
1210                continue
1211            if v.get_testname() == TestView.SUITE_PREP:
1212                if v.is_aborted() and v.hit_timeout():
1213                    current_code = RETURN_CODES.SUITE_TIMEOUT
1214                elif v.is_in_fail_status():
1215                    current_code = RETURN_CODES.INFRA_FAILURE
1216                elif v['status'] == 'WARN':
1217                    current_code = RETURN_CODES.WARNING
1218                else:
1219                    current_code = RETURN_CODES.OK
1220            else:
1221                if v.is_aborted() and v.is_relevant_suite_view():
1222                    # The test was aborted before it started.
1223                    # This guarantees that the suite has timed out.
1224                    current_code = RETURN_CODES.SUITE_TIMEOUT
1225                elif v.is_aborted() and not v.hit_timeout():
1226                    # The test was aborted, but
1227                    # not due to a timeout. This is most likely
1228                    # because the suite has timed out, but may
1229                    # also because it was aborted by the user.
1230                    # Since suite timing out is determined by checking
1231                    # the suite prep view, we simply ignore this view here.
1232                    current_code = RETURN_CODES.OK
1233                elif v.is_in_fail_status():
1234                    # The test job failed.
1235                    if v.is_infra_test():
1236                        current_code = RETURN_CODES.INFRA_FAILURE
1237                    else:
1238                        current_code = RETURN_CODES.ERROR
1239                elif v['status'] == 'WARN':
1240                    # The test/suite job raised a warning.
1241                    current_code = RETURN_CODES.WARNING
1242                elif v.is_retry():
1243                    # The test is a passing retry.
1244                    current_code = RETURN_CODES.WARNING
1245                    tests_passed_after_retry = True
1246                else:
1247                    current_code = RETURN_CODES.OK
1248            code = get_worse_code(code, current_code)
1249
1250        self.return_code = code
1251        self.return_message = self._get_return_msg(
1252                code, tests_passed_after_retry)
1253
1254
1255    def output_results(self):
1256        """Output test results, timings and web links."""
1257        # Output test results
1258        for v in self._test_views:
1259            display_name = v.get_testname().ljust(self._max_testname_width)
1260            logging.info('%s%s', display_name,
1261                         get_pretty_status(v['status']))
1262            if v['status'] != 'GOOD':
1263                logging.info('%s  %s: %s', display_name, v['status'],
1264                             v['reason'])
1265            if v.is_retry():
1266                retry_count = self._retry_counts.get(v['test_idx'], 0)
1267                logging.info('%s  retry_count: %s',
1268                             display_name, retry_count)
1269        # Output suite timings
1270        logging.info(self.timings)
1271        # Output links to test logs
1272        logging.info('\nLinks to test logs:')
1273        for link in self._web_links:
1274            logging.info(link.GenerateTextLink())
1275        logging.info('\n')
1276
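    # Example of what output_results() logs, assuming get_pretty_status()
    # renders markers such as '[ PASSED ]' / '[ FAILED ]'; the test names,
    # statuses and reasons below are hypothetical:
    #
    #   dummy_Pass            [ PASSED ]
    #   dummy_Fail            [ FAILED ]
    #   dummy_Fail            FAIL: always fails
    #   dummy_Fail            retry_count: 1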
1277
1278    def get_results_dict(self):
1279        """Write test results, timings and web links into a dict.
1280
1281        @returns: A dict of results in a format like:
1282                  {
1283                  'tests': {
1284                        'test_1': {'status': 'PASSED', 'attributes': [1,2], ...},
1285                        'test_2': {'status': 'FAILED', 'attributes': [1], ...}
1286                  },
1287                  'suite_timings': {
1288                        'download_start': '1998-07-17 00:00:00',
1289                        'payload_download_end': '1998-07-17 00:00:05',
1290                        ...
1291                  }
1292                  }
1293        """
1294        output_dict = {}
1295        tests_dict = output_dict.setdefault('tests', {})
1296        for v in self._test_views:
1297            test_name = v.get_testname()
1298            test_info = tests_dict.setdefault(test_name, {})
1299            test_info.update({
1300                'status': v['status'],
1301                'attributes': v.get_control_file_attributes() or list(),
1302                'reason': v['reason'],
1303                'retry_count': self._retry_counts.get(v['test_idx'], 0),
1304                })
1305
1306        # Write the links to test logs into the |tests_dict| of |output_dict|.
1307        # For tests whose status is not 'GOOD', the link is also a buildbot link.
1308        for link in self._web_links:
1309            test_name = link.anchor.strip()
1310            test_info = tests_dict.get(test_name)
1311            if test_info:
1312                test_info['link_to_logs'] = link.url
1313                # Write the wmatrix link into the dict.
1314                if link in self._buildbot_links and link.testname:
1315                    test_info['wmatrix_link'] = WMATRIX_RETRY_URL % link.testname
1316                # Write the bug url into the dict.
1317                if link.bug_id:
1318                    test_info['bug_url'] = link.get_bug_link(link.bug_id)
1319
1320        # Write the suite timings into |output_dict|
1321        time_dict = output_dict.setdefault('suite_timings', {})
1322        time_dict.update({
1323            'download_start' : str(self.timings.download_start_time),
1324            'payload_download_end' : str(self.timings.payload_end_time),
1325            'suite_start' : str(self.timings.suite_start_time),
1326            'artifact_download_end' : str(self.timings.artifact_end_time),
1327            'tests_start' : str(self.timings.tests_start_time),
1328            'tests_end' : str(self.timings.tests_end_time),
1329            })
1330
1331        output_dict['suite_job_id'] = self._suite_job_id
1332
1333        return output_dict
1334
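    # Illustrative sketch (comments only, not executed) of consuming the
    # results dict; the test name and values are hypothetical:
    #
    #   results = collector.get_results_dict()
    #   print json.dumps(results, sort_keys=True, indent=2)
    #   # results['tests']['dummy_Pass']['status']  -> 'GOOD'
    #   # results['suite_timings']['suite_start']   -> '2015-01-01 00:00:00'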
1335
1336    def output_buildbot_links(self):
1337        """Output buildbot links."""
1338        for link in self._buildbot_links:
1339            logging.info(link.GenerateBuildbotLink())
1340            wmatrix_link = link.GenerateWmatrixRetryLink()
1341            if wmatrix_link:
1342                logging.info(wmatrix_link)
1343
1344
1345    def run(self):
1346        """Collect test results.
1347
1348        This method goes through the following steps:
1349            Fetch relevant test views of the suite job.
1350            Fetch test views of child jobs.
1351            Check whether the suite was aborted.
1352            Generate links.
1353            Calculate suite timings.
1354            Compute return code based on the test result.
1355
1356        """
1357        if self._solo_test_run:
1358            self._test_views, self._retry_counts = (
1359                  self._fetch_test_views_of_child_jobs(
1360                          jobs=self._afe.get_jobs(id=self._suite_job_id)))
1361        else:
1362            self._suite_views = self._fetch_relevant_test_views_of_suite()
1363            self._child_views, self._retry_counts = (
1364                    self._fetch_test_views_of_child_jobs())
1365            self._test_views = self._suite_views + self._child_views
1366        # For a hostless job in Starting status, there is no associated test
1367        # view. This can happen when a suite job in Starting status is aborted.
1368        # When the scheduler hits some limit, e.g., max_hostless_jobs_per_drone
1369        # or max_jobs_started_per_cycle, a suite job can stay in Starting status.
1370        if not self._test_views:
1371            self.return_code = RETURN_CODES.INFRA_FAILURE
1372            self.return_message = 'No test view was found.'
1373            return
1374        self.is_aborted = any([view['job_keyvals'].get('aborted_by')
1375                               for view in self._suite_views])
1376        self._max_testname_width = max(
1377                [len(v.get_testname()) for v in self._test_views]) + 3
1378        self._generate_web_and_buildbot_links()
1379        self._record_timings()
1380        self._compute_return_code()
1381
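    # Illustrative usage sketch (comments only, not executed); the argument
    # values below are hypothetical:
    #
    #   collector = ResultCollector(
    #           instance_server='cautotest', afe=afe, tko=tko,
    #           build='lumpy-release/R38-1234.0.0', board='lumpy',
    #           suite_name='bvt', suite_job_id=123,
    #           original_suite_name='bvt')
    #   collector.run()
    #   print collector.return_code, collector.return_message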
1382
1383    def gather_timing_stats(self):
1384        """Collect timing related statistics."""
1385        # Send timings to statsd.
1386        self.timings.SendResultsToStatsd(
1387                self._original_suite_name, self._build, self._board)
1388
1389        # Record suite runtime in metadata db.
1390        # Some failure modes can leave times unassigned; report a sentinel
1391        # value in that case.
1392        runtime_in_secs = -1
1393        if (self.timings.tests_end_time is not None and
1394            self.timings.suite_start_time is not None):
1395            runtime_in_secs = (self.timings.tests_end_time -
1396                    self.timings.suite_start_time).total_seconds()
1397
1398        job_overhead.record_suite_runtime(self._suite_job_id, self._suite_name,
1399                self._board, self._build, self._num_child_jobs, runtime_in_secs)
1400
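    # Worked example (illustrative): with suite_start_time at 10:00:00 and
    # tests_end_time at 10:42:30 on the same day, runtime_in_secs above is
    # (tests_end_time - suite_start_time).total_seconds() == 2550.0; if either
    # timestamp is missing, the sentinel value -1 is reported instead.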
1401
1402@retry.retry(error.StageControlFileFailure, timeout_min=10)
1403def create_suite(afe, options):
1404    """Create a suite with retries.
1405
1406    @param afe: The afe object to insert the new suite job into.
1407    @param options: The options to use in creating the suite.
1408
1409    @return: The afe_job_id of the new suite job.
1410    """
1411    builds = {}
1412    if options.build:
1413        if re.match(ANDROID_BUILD_REGEX, options.build):
1414            builds[provision.ANDROID_BUILD_VERSION_PREFIX] = options.build
1415        else:
1416            builds[provision.CROS_VERSION_PREFIX] = options.build
1417    if options.firmware_rw_build:
1418        builds[provision.FW_RW_VERSION_PREFIX] = options.firmware_rw_build
1419    if options.firmware_ro_build:
1420        builds[provision.FW_RO_VERSION_PREFIX] = options.firmware_ro_build
1421    wait = options.no_wait == 'False'
1422    file_bugs = options.file_bugs == 'True'
1423    retry = options.retry == 'True'
1424    offload_failures_only = options.offload_failures_only == 'True'
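    # Note: these flags arrive as the strings 'True'/'False' rather than real
    # booleans, hence the string comparisons above instead of truthiness checks.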
1425    try:
1426        priority = int(options.priority)
1427    except ValueError:
1428        try:
1429            priority = priorities.Priority.get_value(options.priority)
1430        except AttributeError:
1431            print 'Unknown priority level %s.  Try one of %s.' % (
1432                  options.priority, ', '.join(priorities.Priority.names))
1433            raise
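    # For example (illustrative): '--priority 50' is used as the integer 50,
    # while a symbolic name is resolved through priorities.Priority.get_value(),
    # assuming the name appears in priorities.Priority.names.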
1434    logging.info('%s Submitted create_suite_job rpc',
1435                 diagnosis_utils.JobTimer.format_time(datetime.now()))
1436    return afe.run('create_suite_job', name=options.name,
1437                   board=options.board, build=options.build,
1438                   builds=builds, test_source_build=options.test_source_build,
1439                   check_hosts=wait, pool=options.pool,
1440                   num=options.num,
1441                   file_bugs=file_bugs, priority=priority,
1442                   suite_args=options.suite_args,
1443                   wait_for_results=wait,
1444                   timeout_mins=options.timeout_mins,
1445                   max_runtime_mins=options.max_runtime_mins,
1446                   job_retry=retry, max_retries=options.max_retries,
1447                   suite_min_duts=options.suite_min_duts,
1448                   offload_failures_only=offload_failures_only,
1449                   run_prod_code=options.run_prod_code)
1450
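# Illustrative sketch (comments only, not executed) of calling create_suite();
# the option values are hypothetical and would normally come from
# parse_options():
#
#   afe = frontend_wrappers.RetryingAFE(server='cautotest', timeout_min=30,
#                                       delay_sec=10)
#   options.name = 'bvt'
#   options.board = 'lumpy'
#   options.build = 'lumpy-release/R38-1234.0.0'
#   job_id = create_suite(afe, options)   # retries StageControlFileFailure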
1451
1452def main_without_exception_handling(options):
1453    """
1454    Run the run_suite flow without top-level exception handling.
1455
1456    @param options: The parsed options.
1457
1458    @returns: A tuple containing the return_code of run_suite and the output
1459              dictionary.
1460
1461    """
1462    # If told to use the new-style suite control file, convert the args.
1463    if options.use_suite_attr:
1464        options = change_options_for_suite_attr(options)
1465
1466    log_name = 'run_suite-default.log'
1467    if options.build:
1468        # Convert the build name so it contains only '_' instead of '/'.
1469        log_name = 'run_suite-%s.log' % options.build.replace('/', '_')
1470        log_dir = os.path.join(common.autotest_dir, 'logs')
1471        if os.path.exists(log_dir):
1472            log_name = os.path.join(log_dir, log_name)
1473
1474    utils.setup_logging(logfile=log_name)
1475
1476    if not options.bypass_labstatus:
1477        utils.check_lab_status(options.build)
1478    instance_server = (options.web if options.web else
1479                       instance_for_pool(options.pool))
1480    afe = frontend_wrappers.RetryingAFE(server=instance_server,
1481                                        timeout_min=options.afe_timeout_mins,
1482                                        delay_sec=options.delay_sec)
1483    logging.info('Autotest instance: %s', instance_server)
1484
1485    rpc_helper = diagnosis_utils.RPCHelper(afe)
1486    is_real_time = True
1487    if options.mock_job_id:
1488        job_id = int(options.mock_job_id)
1489        existing_job = afe.get_jobs(id=job_id, finished=True)
1490        if existing_job:
1491            is_real_time = False
1492        else:
1493            existing_job = afe.get_jobs(id=job_id)
1494        if existing_job:
1495            job_created_on = time_utils.date_string_to_epoch_time(
1496                    existing_job[0].created_on)
1497        else:
1498            raise utils.TestLabException('Failed to retrieve job: %d' % job_id)
1499    else:
1500        try:
1501            rpc_helper.check_dut_availability(options.board, options.pool,
1502                                              options.minimum_duts)
1503            job_id = create_suite(afe, options)
1504            job_created_on = time.time()
1505        except diagnosis_utils.NotEnoughDutsError:
1506            logging.info(GetBuildbotStepLink(
1507                    'Pool Health Bug', LogLink.get_bug_link(rpc_helper.bug)))
1508            raise
1509        except (error.CrosDynamicSuiteException,
1510                error.RPCException, proxy.JSONRPCException) as e:
1511            logging.warning('Error Message: %s', e)
1512            return (RETURN_CODES.INFRA_FAILURE, {'return_message': str(e)})
1513        except AttributeError:
1514            return (RETURN_CODES.INVALID_OPTIONS, {})
1515
1516    job_timer = diagnosis_utils.JobTimer(
1517            job_created_on, float(options.timeout_mins))
1518    job_url = reporting_utils.link_job(job_id,
1519                                       instance_server=instance_server)
1520    logging.info('%s Created suite job: %s',
1521                 job_timer.format_time(job_timer.job_created_time),
1522                 job_url)
1523    # TODO(akeshet): Move this link-printing to chromite.
1524    logging.info(GetBuildbotStepLink('Suite created', job_url))
1525
1526    if options.create_and_return:
1527        msg = '--create_and_return was specified, terminating now.'
1528        logging.info(msg)
1529        return (RETURN_CODES.OK, {'return_message':msg})
1530
1531    TKO = frontend_wrappers.RetryingTKO(server=instance_server,
1532                                        timeout_min=options.afe_timeout_mins,
1533                                        delay_sec=options.delay_sec)
1534    code = RETURN_CODES.OK
1535    wait = options.no_wait == 'False'
1536    output_dict = {}
1537    if wait:
1538        while not afe.get_jobs(id=job_id, finished=True):
1539            # Note that this call logs output, preventing buildbot's
1540            # 9000 second silent timeout from kicking in. Let there be no
1541            # doubt, this is a hack. The timeout is from upstream buildbot and
1542            # this is the easiest workaround.
1543            if job_timer.first_past_halftime():
1544                rpc_helper.diagnose_job(job_id, instance_server)
1545            if job_timer.debug_output_timer.poll():
1546                logging.info('The suite job has another %s till timeout.',
1547                             job_timer.timeout_hours - job_timer.elapsed_time())
1548            time.sleep(10)
1549        # In most cases, ResultCollector should be able to determine whether
1550        # a suite has timed out by checking information in the test view.
1551        # However, occasionally the tko parser may fail to parse the
1552        # job_finished time from the job's keyval file. So we add another
1553        # layer of timeout checking in run_suite. We do the check right after
1554        # the suite finishes to make it as accurate as possible.
1555        # There is a minor race condition here where we might have aborted
1556        # for some reason other than a timeout, and the job_timer thinks
1557        # it's a timeout because of the jitter in waiting for results.
1558        # The consequence would be that run_suite exits with code
1559        # SUITE_TIMEOUT while it should have returned INFRA_FAILURE
1560        # instead; this should happen very rarely.
1561        # Note that the timeout check is meaningless when the -m option is used.
1562        is_suite_timeout = job_timer.is_suite_timeout()
1563
1564        # Extract the original suite name to record timing.
1565        original_suite_name = get_original_suite_name(options.name,
1566                                                      options.suite_args)
1567        # Start collecting test results.
1568        collector = ResultCollector(instance_server=instance_server,
1569                                    afe=afe, tko=TKO, build=options.build,
1570                                    board=options.board,
1571                                    suite_name=options.name,
1572                                    suite_job_id=job_id,
1573                                    original_suite_name=original_suite_name)
1574        collector.run()
1575        # Dump test outputs into json.
1576        output_dict = collector.get_results_dict()
1577        output_dict['autotest_instance'] = instance_server
1578        if not options.json_dump:
1579            collector.output_results()
1580        code = collector.return_code
1581        return_message = collector.return_message
1582        if is_real_time:
1583            # Do not record stats if the suite was aborted (either by a user
1584            # or through the golo rpc).
1585            # Also do not record stats if is_aborted is None, indicating that
1586            # the aborted status is not yet known.
1587            if collector.is_aborted is False:
1588                collector.gather_timing_stats()
1589
1590            if collector.is_aborted is True and is_suite_timeout:
1591                # There are two possible cases when a suite times out:
1592                # 1. The suite job was aborted due to timing out.
1593                # 2. The suite job succeeded, but some child jobs
1594                #    were already aborted before the suite job exited.
1595                # Case 2 is handled by ResultCollector;
1596                # here we handle case 1.
1597                old_code = code
1598                code = get_worse_code(
1599                        code, RETURN_CODES.SUITE_TIMEOUT)
1600                if old_code != code:
1601                    return_message = 'Suite job timed out.'
1602                    logging.info('Upgrade return code from %s to %s '
1603                                 'because suite job has timed out.',
1604                                 RETURN_CODES.get_string(old_code),
1605                                 RETURN_CODES.get_string(code))
1606            if is_suite_timeout:
1607                logging.info('\nAttempting to diagnose pool: %s', options.pool)
1608                try:
1609                    # Add some jitter to make up for any latency in
1610                    # aborting the suite or checking for results.
1611                    cutoff = (job_timer.timeout_hours +
1612                              datetime_base.timedelta(hours=0.3))
1613                    rpc_helper.diagnose_pool(
1614                            options.board, options.pool, cutoff)
1615                except proxy.JSONRPCException as e:
1616                    logging.warning('Unable to diagnose suite abort: %s', e)
1617
1618        # And output return message.
1619        if return_message:
1620            logging.info('Reason: %s', return_message)
1621            output_dict['return_message'] = return_message
1622
1623        logging.info('\nOutput below this line is for buildbot consumption:')
1624        collector.output_buildbot_links()
1625    else:
1626        logging.info('Created suite job: %r', job_id)
1627        link = LogLink(options.name, instance_server,
1628                       '%s-%s' % (job_id, getpass.getuser()))
1629        logging.info(link.GenerateBuildbotLink())
1630        output_dict['return_message'] = '--no_wait specified; Exiting.'
1631        logging.info('--no_wait specified; Exiting.')
1632    return (code, output_dict)
1633
1634
1635def main():
1636    """Entry point."""
1637    utils.verify_not_root_user()
1638    code = RETURN_CODES.OK
1639    output_dict = {}
1640
1641    try:
1642        parser, options, args = parse_options()
1643        # Silence the log when dumping outputs into json
1644        if options.json_dump:
1645            logging.disable(logging.CRITICAL)
1646
1647        if not verify_options_and_args(options, args):
1648            parser.print_help()
1649            code = RETURN_CODES.INVALID_OPTIONS
1650        else:
1651            (code, output_dict) = main_without_exception_handling(options)
1652    except diagnosis_utils.BoardNotAvailableError as e:
1653        output_dict['return_message'] = 'Skipping testing: %s' % e.message
1654        code = RETURN_CODES.BOARD_NOT_AVAILABLE
1655        logging.info(output_dict['return_message'])
1656    except utils.TestLabException as e:
1657        output_dict['return_message'] = 'TestLabException: %s' % e
1658        code = RETURN_CODES.INFRA_FAILURE
1659        logging.exception(output_dict['return_message'])
1660    except Exception as e:
1661        output_dict['return_message'] = 'Unhandled run_suite exception: %s' % e
1662        code = RETURN_CODES.INFRA_FAILURE
1663        logging.exception(output_dict['return_message'])
1664
1665    # Dump test outputs into json.
1666    output_dict['return_code'] = code
1667    output_json = json.dumps(output_dict, sort_keys=True)
1668    if options.json_dump:
1669        output_json_marked = '#JSON_START#%s#JSON_END#' % output_json.strip()
1670        sys.stdout.write(output_json_marked)
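    # Example of the marked-up JSON a caller might scrape from stdout
    # (field values are illustrative):
    #   #JSON_START#{"return_code": 0, "return_message": "...", ...}#JSON_END#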
1671
1672    logging.info('Will return from run_suite with status: %s',
1673                  RETURN_CODES.get_string(code))
1674    autotest_stats.Counter('run_suite.%s' %
1675                           RETURN_CODES.get_string(code)).increment()
1676    return code
1677
1678
1679if __name__ == "__main__":
1680    sys.exit(main())
1681