• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import datetime
6import logging
7import threading
8
9import common
10from autotest_lib.client.common_lib import error
11from autotest_lib.client.common_lib import global_config
12from autotest_lib.server import site_utils
13from autotest_lib.server.cros import provision
14from autotest_lib.server.cros.dynamic_suite import frontend_wrappers, reporting
15
16try:
17    from chromite.lib import metrics
18except ImportError:
19    metrics = site_utils.metrics_mock
20
21
CONFIG = global_config.global_config

# Base runtime limit, in minutes, read from the AUTOTEST_WEB section of the
# global config with a default of 72 hours (72*60 minutes). It is used in
# this module as the base for a suite job's max_runtime_mins/timeout_mins.
JOB_MAX_RUNTIME_MINS_DEFAULT = CONFIG.get_config_value(
        'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int, default=72*60)

# Minimum RPC timeout setting for calls expected to take a long time, e.g.,
# create_suite_job. If the default socket timeout
# (socket.getdefaulttimeout()) is None or greater than this value, the default
# will be used.
# The value here is set to be the same as the timeout for the RetryingAFE
# object so long-running RPCs can wait long enough before being aborted.
_MIN_RPC_TIMEOUT = 600

# Number of days back to search for an existing job.
SEARCH_JOB_MAX_DAYS = 14

# Number of minutes by which to change the value of
# DedupingScheduler.delay_minutes on each scheduling call.
# This allows all suite jobs created in the same event to start provision jobs
# at different times. 5 minutes allows 40 boards to have provision jobs started
# within about 200 minutes. That way, we don't add too much delay on test jobs
# and do not keep suite jobs running for too long. Note that suite jobs created
# by the suite scheduler do not wait for test jobs to finish. That helps to
# reduce the load on the drone.
DELAY_MINUTES_INTERVAL = 5
# Set the maximum delay to 24 hours (in minutes). This is to prevent suite jobs
# from running for too long. Nightly and new_build tasks won't create so many
# suites that they need such a long delay. However, weekly tasks can create
# several hundred suites, as most of them are required to run on all branches.
MAX_DELAY_MINUTES = 1440
50
class DedupingSchedulerException(Exception):
    """Root of the exception hierarchy defined by this module.

    Catching this type handles every exception class declared here.
    """
54
55
class ScheduleException(DedupingSchedulerException):
    """Signals that scheduling a suite job through the AFE failed."""
59
60
class DedupException(DedupingSchedulerException):
    """Signals a failure while looking up previously created jobs."""
64
65
66class DedupingScheduler(object):
67    """A class that will schedule suites to run on a given board, build.
68
69    Includes logic to check whether or not a given (suite, board, build)
70    has already been run.  If so, it will skip scheduling that suite.
71
72    @var _afe: a frontend.AFE instance used to talk to autotest.
73    """
74
75    _SUITE_SCHEDULER_SUITE_COUNT = metrics.Counter(
76            'chromeos/autotest/suite_scheduler/suite/created')
77
78    def __init__(self, afe=None, file_bug=False):
79        """Constructor
80
81        @param afe: an instance of AFE as defined in server/frontend.py.
82                    Defaults to a frontend_wrappers.RetryingAFE instance.
83        """
84        self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
85                                                         delay_sec=10,
86                                                         debug=False)
87        self._file_bug = file_bug
88
89        # Number of minutes to delay a suite job from creating test jobs.
90        self.delay_minutes = 0
91        # Number of minutes to increase of decrease self.delay_minutes. When
92        # self.delay_minutes reaches MAX_DELAY_MINUTES, it should wind down
93        # to allow even distribution of test job creation.
94        self.delay_minutes_interval = DELAY_MINUTES_INTERVAL
95        # Lock to make sure each suite created with different delay_minutes.
96        self._lock = threading.Lock()
97
98
99    def _ShouldScheduleSuite(self, suite, board, test_source_build):
100        """Return True if |suite| has not yet been run for |build| on |board|.
101
102        True if |suite| has not been run for |build| on |board|, and
103        the lab is open for this particular request.  False otherwise.
104
105        @param suite: the name of the suite to run, e.g. 'bvt'
106        @param board: the board to run the suite on, e.g. x86-alex
107        @param test_source_build: Build with the source of tests.
108
109        @return False if the suite was already scheduled, True if not
110        @raise DedupException if the AFE raises while searching for jobs.
111
112        """
113        try:
114            site_utils.check_lab_status(test_source_build)
115        except site_utils.TestLabException as ex:
116            logging.debug('Skipping suite %s, board %s, build %s:  %s',
117                          suite, board, test_source_build, str(ex))
118            return False
119        try:
120            start_time = str(datetime.datetime.now() -
121                             datetime.timedelta(days=SEARCH_JOB_MAX_DAYS))
122            return not self._afe.get_jobs(
123                    name__istartswith=test_source_build,
124                    name__iendswith='control.'+suite,
125                    created_on__gte=start_time,
126                    min_rpc_timeout=_MIN_RPC_TIMEOUT)
127        except Exception as e:
128            raise DedupException(e)
129
130
131    def _Schedule(self, suite, board, build, pool, num, priority, timeout,
132                  file_bugs=False, firmware_rw_build=None,
133                  firmware_ro_build=None, test_source_build=None,
134                  job_retry=False, launch_control_build=None,
135                  run_prod_code=False, testbed_dut_count=None, no_delay=False):
136        """Schedule |suite|, if it hasn't already been run.
137
138        @param suite: the name of the suite to run, e.g. 'bvt'
139        @param board: the board to run the suite on, e.g. x86-alex
140        @param build: the ChromeOS build to install e.g.
141                      x86-alex-release/R18-1655.0.0-a1-b1584.
142        @param pool: the pool of machines to use for scheduling purposes.
143                     Default: None
144        @param num: the number of devices across which to shard the test suite.
145                    Type: integer or None
146                    Default: None (uses sharding factor in global_config.ini).
147        @param priority: One of the values from
148                         client.common_lib.priorities.Priority.
149        @param timeout: The max lifetime of the suite in hours.
150        @param file_bugs: True if bug filing is desired for this suite.
151        @param firmware_rw_build: Firmware build to update RW firmware. Default
152                                  to None.
153        @param firmware_ro_build: Firmware build to update RO firmware. Default
154                                  to None.
155        @param test_source_build: Build that contains the server-side test code.
156                                  Default to None to use the ChromeOS build
157                                  (defined by `build`).
158        @param job_retry: Set to True to enable job-level retry. Default is
159                          False.
160        @param launch_control_build: Name of a Launch Control build, e.g.,
161                                     'git_mnc_release/shamu-eng/123'
162        @param run_prod_code: If True, the suite will run the test code that
163                              lives in prod aka the test code currently on the
164                              lab servers. If False, the control files and test
165                              code for this suite run will be retrieved from the
166                              build artifacts. Default is False.
167        @param testbed_dut_count: Number of duts to test when using a testbed.
168        @param no_delay: Set to True to allow suite to be created without
169                         configuring delay_minutes. Default is False.
170
171        @return True if the suite got scheduled
172        @raise ScheduleException if an error occurs while scheduling.
173
174        """
175        try:
176            if build:
177                builds = {provision.CROS_VERSION_PREFIX: build}
178            if firmware_rw_build:
179                builds[provision.FW_RW_VERSION_PREFIX] = firmware_rw_build
180            if firmware_ro_build:
181                builds[provision.FW_RO_VERSION_PREFIX] = firmware_ro_build
182            if launch_control_build:
183                if testbed_dut_count is None:
184                    builds = {provision.ANDROID_BUILD_VERSION_PREFIX:
185                              launch_control_build}
186                else:
187                    builds = {provision.TESTBED_BUILD_VERSION_PREFIX:
188                              launch_control_build}
189
190            # Suite scheduler handles all boards in parallel, to guarantee each
191            # call of `create_suite_job` use different value of delay_minutes,
192            # we need a lock around get/set attempts of self.delay_minutes.
193            # To prevent suite jobs from running too long, the value for
194            # self.delay_minutes is limited between 0 and MAX_DELAY_MINUTES (4
195            # hours). The value starts at 0 and is increased by
196            # DELAY_MINUTES_INTERVAL, when it reaches MAX_DELAY_MINUTES, the
197            # logic here allows its value to step back by DELAY_MINUTES_INTERVAL
198            # at each call of this method. When the value drops back to 0, it
199            # will increase again in the next call of this method.
200            # Such logic allows the values of delay_minutes for all calls
201            # of `create_suite_job` running in parallel to be evenly distributed
202            # between 0 and MAX_DELAY_MINUTES.
203            delay_minutes = 0
204            if not no_delay:
205                with self._lock:
206                    delay_minutes = self.delay_minutes
207                    if ((self.delay_minutes < MAX_DELAY_MINUTES and
208                         self.delay_minutes_interval > 0) or
209                        (self.delay_minutes >= DELAY_MINUTES_INTERVAL and
210                         self.delay_minutes_interval < 0)):
211                        self.delay_minutes += self.delay_minutes_interval
212                    else:
213                        limit = ('Maximum' if self.delay_minutes_interval > 0
214                                 else 'Minimum')
215                        logging.info(
216                                '%s delay minutes reached when scheduling '
217                                '%s on %s against %s (pool: %s)',
218                                limit, suite, builds, board, pool)
219                        self.delay_minutes_interval = (
220                                -self.delay_minutes_interval)
221
222            # Update timeout settings for the suite job with delay_minutes.
223            # `timeout` is in hours.
224            if not timeout:
225                timeout = JOB_MAX_RUNTIME_MINS_DEFAULT / 60.0
226            timeout += delay_minutes / 60.0
227            max_runtime_mins = JOB_MAX_RUNTIME_MINS_DEFAULT + delay_minutes
228            timeout_mins = JOB_MAX_RUNTIME_MINS_DEFAULT + delay_minutes
229
230            logging.info('Scheduling %s on %s against %s (pool: %s)',
231                         suite, builds, board, pool)
232            if self._afe.run('create_suite_job', name=suite, board=board,
233                             builds=builds, check_hosts=False, num=num,
234                             pool=pool, priority=priority, timeout=timeout,
235                             max_runtime_mins=max_runtime_mins,
236                             timeout_mins=timeout_mins,
237                             file_bugs=file_bugs,
238                             wait_for_results=file_bugs,
239                             test_source_build=test_source_build,
240                             job_retry=job_retry,
241                             delay_minutes=delay_minutes,
242                             run_prod_code=run_prod_code,
243                             min_rpc_timeout=_MIN_RPC_TIMEOUT) is not None:
244                # Report data to metrics.
245                fields = {'suite': suite,
246                          'board': board,
247                          'pool': pool,
248                          'priority': str(priority)}
249                self._SUITE_SCHEDULER_SUITE_COUNT.increment(fields=fields)
250                return True
251            else:
252                raise ScheduleException(
253                        "Can't schedule %s for %s." % (suite, builds))
254        except (error.ControlFileNotFound, error.ControlFileEmpty,
255                error.ControlFileMalformed, error.NoControlFileList) as e:
256            if self._file_bug:
257                # File bug on test_source_build if it's specified.
258                b = reporting.SuiteSchedulerBug(
259                        suite, test_source_build or build, board, e)
260                # If a bug has filed with the same <suite, build, error type>
261                # will not file again, but simply gets the existing bug id.
262                bid, _ = reporting.Reporter().report(
263                        b, ignore_duplicate=True)
264                if bid is not None:
265                    return False
266            # Raise the exception if not filing a bug or failed to file bug.
267            raise ScheduleException(e)
268        except Exception as e:
269            raise ScheduleException(e)
270
271
272    def ScheduleSuite(self, suite, board, build, pool, num, priority, timeout,
273                      force=False, file_bugs=False, firmware_rw_build=None,
274                      firmware_ro_build=None, test_source_build=None,
275                      job_retry=False, launch_control_build=None,
276                      run_prod_code=False, testbed_dut_count=None,
277                      no_delay=False):
278        """Schedule |suite|, if it hasn't already been run.
279
280        If |suite| has not already been run against |build| on |board|,
281        schedule it and return True.  If it has, return False.
282
283        @param suite: the name of the suite to run, e.g. 'bvt'
284        @param board: the board to run the suite on, e.g. x86-alex
285        @param build: the ChromeOS build to install e.g.
286                      x86-alex-release/R18-1655.0.0-a1-b1584.
287        @param pool: the pool of machines to use for scheduling purposes.
288        @param num: the number of devices across which to shard the test suite.
289                    Type: integer or None
290        @param priority: One of the values from
291                         client.common_lib.priorities.Priority.
292        @param timeout: The max lifetime of the suite in hours.
293        @param force: Always schedule the suite.
294        @param file_bugs: True if bug filing is desired for this suite.
295        @param firmware_rw_build: Firmware build to update RW firmware. Default
296                                  to None.
297        @param firmware_ro_build: Firmware build to update RO firmware. Default
298                                  to None.
299        @param test_source_build: Build with the source of tests. Default to
300                                  None to use the ChromeOS build.
301        @param job_retry: Set to True to enable job-level retry. Default is
302                          False.
303        @param launch_control_build: Name of a Launch Control build, e.g.,
304                                     'git_mnc_release/shamu-eng/123'
305        @param run_prod_code: If True, the suite will run the test code that
306                              lives in prod aka the test code currently on the
307                              lab servers. If False, the control files and test
308                              code for this suite run will be retrieved from the
309                              build artifacts. Default is False.
310        @param testbed_dut_count: Number of duts to test when using a testbed.
311        @param no_delay: Set to True to allow suite to be created without
312                configuring delay_minutes. Default is False.
313
314        @return True if the suite got scheduled, False if not
315        @raise DedupException if we can't check for dups.
316        @raise ScheduleException if the suite cannot be scheduled.
317
318        """
319        if (force or self._ShouldScheduleSuite(
320                suite, board,
321                test_source_build or build or launch_control_build)):
322            return self._Schedule(suite, board, build, pool, num, priority,
323                                  timeout, file_bugs=file_bugs,
324                                  firmware_rw_build=firmware_rw_build,
325                                  firmware_ro_build=firmware_ro_build,
326                                  test_source_build=test_source_build,
327                                  job_retry=job_retry,
328                                  launch_control_build=launch_control_build,
329                                  run_prod_code=run_prod_code,
330                                  testbed_dut_count=testbed_dut_count,
331                                  no_delay=no_delay)
332        return False
333
334
335    def CheckHostsExist(self, *args, **kwargs):
336        """Forward a request to check if hosts matching args, kwargs exist."""
337        try:
338            kwargs['min_rpc_timeout'] = _MIN_RPC_TIMEOUT
339            return self._afe.get_hostnames(*args, **kwargs)
340        except error.TimeoutException as e:
341            logging.exception(e)
342            return []
343