• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import datetime
6import logging
7import threading
8
9import common
10from autotest_lib.client.common_lib import error
11from autotest_lib.client.common_lib import global_config
12from autotest_lib.server import site_utils
13from autotest_lib.server.cros import provision
14from autotest_lib.server.cros.dynamic_suite import frontend_wrappers, reporting
15
16try:
17    from chromite.lib import metrics
18except ImportError:
19    metrics = site_utils.metrics_mock
20
21
# Handle used below to read values from the global configuration.
CONFIG = global_config.global_config

# Default maximum job runtime, in minutes. Read from the AUTOTEST_WEB section
# of the global config, with 72*60 minutes (72 hours) as the default.
JOB_MAX_RUNTIME_MINS_DEFAULT = CONFIG.get_config_value(
        'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int, default=72*60)

# Minimum RPC timeout setting for calls expected to take a long time, e.g.,
# create_suite_job. If the default socket timeout
# (socket.getdefaulttimeout()) is None or greater than this value, the default
# will be used.
# The value here is set to be the same as the timeout for the RetryingAFE
# object so long running RPCs can wait long enough before being aborted.
_MIN_RPC_TIMEOUT = 600

# Number of days back to search for an existing job.
SEARCH_JOB_MAX_DAYS = 14

# Number of minutes to increase the value of DedupingScheduler.delay_minutes.
# This allows all suite jobs created in the same event to start provision jobs
# at different times. 5 minutes allows 40 boards to have provision jobs started
# within about 200 minutes. That way, we don't add too much delay on test jobs
# and do not keep suite jobs running for too long. Note that suite jobs created
# by suite scheduler do not wait for test jobs to finish. That helps to reduce
# the load on drone.
DELAY_MINUTES_INTERVAL = 5
# Set maximum delay minutes to 24 hours. This is to prevent suite jobs from
# running for too long. Nightly and new_build tasks won't create so many
# suites that they need such a long delay. However, weekly tasks can create
# several hundreds of suites as most of them require to run on all branches.
MAX_DELAY_MINUTES = 1440
50
class DedupingSchedulerException(Exception):
    """Root of the exception hierarchy raised by this module."""
54
55
class ScheduleException(DedupingSchedulerException):
    """Signals that the AFE returned an error while a suite was scheduled."""
59
60
class DedupException(DedupingSchedulerException):
    """Signals a failure while looking for previously created jobs."""
64
65
66class DedupingScheduler(object):
67    """A class that will schedule suites to run on a given board, build.
68
69    Includes logic to check whether or not a given (suite, board, build)
70    has already been run.  If so, it will skip scheduling that suite.
71
72    @var _afe: a frontend.AFE instance used to talk to autotest.
73    """
74
75    _SUITE_SCHEDULER_SUITE_COUNT = metrics.Counter(
76            'chromeos/autotest/suite_scheduler/suite/created')
77
78    def __init__(self, afe=None, file_bug=False):
79        """Constructor
80
81        @param afe: an instance of AFE as defined in server/frontend.py.
82                    Defaults to a frontend_wrappers.RetryingAFE instance.
83        """
84        self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
85                                                         delay_sec=10,
86                                                         debug=False)
87        self._file_bug = file_bug
88
89        # Number of minutes to delay a suite job from creating test jobs.
90        self.delay_minutes = 0
91        # Number of minutes to increase of decrease self.delay_minutes. When
92        # self.delay_minutes reaches MAX_DELAY_MINUTES, it should wind down
93        # to allow even distribution of test job creation.
94        self.delay_minutes_interval = DELAY_MINUTES_INTERVAL
95        # Lock to make sure each suite created with different delay_minutes.
96        self._lock = threading.Lock()
97
98
99    def _ShouldScheduleSuite(self, suite, board, test_source_build):
100        """Return True if |suite| has not yet been run for |build| on |board|.
101
102        True if |suite| has not been run for |build| on |board|, and
103        the lab is open for this particular request.  False otherwise.
104
105        @param suite: the name of the suite to run, e.g. 'bvt'
106        @param board: the board to run the suite on, e.g. x86-alex
107        @param test_source_build: Build with the source of tests.
108
109        @return False if the suite was already scheduled, True if not
110        @raise DedupException if the AFE raises while searching for jobs.
111
112        """
113        try:
114            site_utils.check_lab_status(test_source_build)
115        except site_utils.TestLabException as ex:
116            logging.debug('Skipping suite %s, board %s, build %s:  %s',
117                          suite, board, test_source_build, str(ex))
118            return False
119        try:
120            start_time = str(datetime.datetime.now() -
121                             datetime.timedelta(days=SEARCH_JOB_MAX_DAYS))
122            return not self._afe.get_jobs(
123                    name__istartswith=test_source_build,
124                    name__iendswith='control.'+suite,
125                    created_on__gte=start_time,
126                    min_rpc_timeout=_MIN_RPC_TIMEOUT)
127        except Exception as e:
128            raise DedupException(e)
129
130
131    def _Schedule(self, suite, board, build, pool, num, priority, timeout,
132                  file_bugs=False, firmware_rw_build=None,
133                  firmware_ro_build=None, test_source_build=None,
134                  job_retry=False, launch_control_build=None,
135                  run_prod_code=False, testbed_dut_count=None, no_delay=False):
136        """Schedule |suite|, if it hasn't already been run.
137
138        @param suite: the name of the suite to run, e.g. 'bvt'
139        @param board: the board to run the suite on, e.g. x86-alex
140        @param build: the ChromeOS build to install e.g.
141                      x86-alex-release/R18-1655.0.0-a1-b1584.
142        @param pool: the pool of machines to use for scheduling purposes.
143                     Default: None
144        @param num: the number of devices across which to shard the test suite.
145                    Type: integer or None
146                    Default: None (uses sharding factor in global_config.ini).
147        @param priority: One of the values from
148                         client.common_lib.priorities.Priority.
149        @param timeout: The max lifetime of the suite in hours.
150        @param file_bugs: True if bug filing is desired for this suite.
151        @param firmware_rw_build: Firmware build to update RW firmware. Default
152                                  to None.
153        @param firmware_ro_build: Firmware build to update RO firmware. Default
154                                  to None.
155        @param test_source_build: Build that contains the server-side test code.
156                                  Default to None to use the ChromeOS build
157                                  (defined by `build`).
158        @param job_retry: Set to True to enable job-level retry. Default is
159                          False.
160        @param launch_control_build: Name of a Launch Control build, e.g.,
161                                     'git_mnc_release/shamu-eng/123'
162        @param run_prod_code: If True, the suite will run the test code that
163                              lives in prod aka the test code currently on the
164                              lab servers. If False, the control files and test
165                              code for this suite run will be retrieved from the
166                              build artifacts. Default is False.
167        @param testbed_dut_count: Number of duts to test when using a testbed.
168        @param no_delay: Set to True to allow suite to be created without
169                         configuring delay_minutes. Default is False.
170
171        @return True if the suite got scheduled
172        @raise ScheduleException if an error occurs while scheduling.
173
174        """
175        try:
176            if build:
177                builds = {provision.CROS_VERSION_PREFIX: build}
178            if firmware_rw_build:
179                builds[provision.FW_RW_VERSION_PREFIX] = firmware_rw_build
180            if firmware_ro_build:
181                builds[provision.FW_RO_VERSION_PREFIX] = firmware_ro_build
182            if launch_control_build:
183                if testbed_dut_count is None:
184                    builds = {provision.ANDROID_BUILD_VERSION_PREFIX:
185                              launch_control_build}
186                else:
187                    builds = {provision.TESTBED_BUILD_VERSION_PREFIX:
188                              launch_control_build}
189
190            # Suite scheduler handles all boards in parallel, to guarantee each
191            # call of `create_suite_job` use different value of delay_minutes,
192            # we need a lock around get/set attempts of self.delay_minutes.
193            # To prevent suite jobs from running too long, the value for
194            # self.delay_minutes is limited between 0 and MAX_DELAY_MINUTES (4
195            # hours). The value starts at 0 and is increased by
196            # DELAY_MINUTES_INTERVAL, when it reaches MAX_DELAY_MINUTES, the
197            # logic here allows its value to step back by DELAY_MINUTES_INTERVAL
198            # at each call of this method. When the value drops back to 0, it
199            # will increase again in the next call of this method.
200            # Such logic allows the values of delay_minutes for all calls
201            # of `create_suite_job` running in parallel to be evenly distributed
202            # between 0 and MAX_DELAY_MINUTES.
203            delay_minutes = 0
204            if not no_delay:
205                with self._lock:
206                    delay_minutes = self.delay_minutes
207                    if ((self.delay_minutes < MAX_DELAY_MINUTES and
208                         self.delay_minutes_interval > 0) or
209                        (self.delay_minutes >= DELAY_MINUTES_INTERVAL and
210                         self.delay_minutes_interval < 0)):
211                        self.delay_minutes += self.delay_minutes_interval
212                    else:
213                        limit = ('Maximum' if self.delay_minutes_interval > 0
214                                 else 'Minimum')
215                        logging.info(
216                                '%s delay minutes reached when scheduling '
217                                '%s on %s against %s (pool: %s)',
218                                limit, suite, builds, board, pool)
219                        self.delay_minutes_interval = (
220                                -self.delay_minutes_interval)
221
222            # Update timeout settings for the suite job with delay_minutes.
223            # `timeout` is in hours.
224            if not timeout:
225                timeout = JOB_MAX_RUNTIME_MINS_DEFAULT / 60.0
226            timeout += delay_minutes / 60.0
227            max_runtime_mins = JOB_MAX_RUNTIME_MINS_DEFAULT + delay_minutes
228            timeout_mins = JOB_MAX_RUNTIME_MINS_DEFAULT + delay_minutes
229
230            logging.info('Scheduling %s on %s against %s (pool: %s)...',
231                         suite, builds, board, pool)
232            job_id = self._afe.run('create_suite_job', name=suite, board=board,
233                                   builds=builds, check_hosts=False, num=num,
234                                   pool=pool, priority=priority,
235                                   timeout=timeout,
236                                   max_runtime_mins=max_runtime_mins,
237                                   timeout_mins=timeout_mins,
238                                   file_bugs=file_bugs,
239                                   wait_for_results=file_bugs,
240                                   test_source_build=test_source_build,
241                                   job_retry=job_retry,
242                                   delay_minutes=delay_minutes,
243                                   run_prod_code=run_prod_code,
244                                   min_rpc_timeout=_MIN_RPC_TIMEOUT)
245            if job_id is not None:
246                logging.info('... created as suite job id %s', job_id)
247                # Report data to metrics.
248                fields = {'suite': suite,
249                          'board': board,
250                          'pool': pool,
251                          'priority': str(priority)}
252                self._SUITE_SCHEDULER_SUITE_COUNT.increment(fields=fields)
253                return True
254            else:
255                raise ScheduleException(
256                        "Can't schedule %s for %s." % (suite, builds))
257        except (error.ControlFileNotFound, error.ControlFileEmpty,
258                error.ControlFileMalformed, error.NoControlFileList) as e:
259            if self._file_bug:
260                # File bug on test_source_build if it's specified.
261                b = reporting.SuiteSchedulerBug(
262                        suite, test_source_build or build, board, e)
263                # If a bug has filed with the same <suite, build, error type>
264                # will not file again, but simply gets the existing bug id.
265                bid, _ = reporting.Reporter().report(
266                        b, ignore_duplicate=True)
267                if bid is not None:
268                    return False
269            # Raise the exception if not filing a bug or failed to file bug.
270            raise ScheduleException(e)
271        except Exception as e:
272            raise ScheduleException(e)
273
274
275    def ScheduleSuite(self, suite, board, build, pool, num, priority, timeout,
276                      force=False, file_bugs=False, firmware_rw_build=None,
277                      firmware_ro_build=None, test_source_build=None,
278                      job_retry=False, launch_control_build=None,
279                      run_prod_code=False, testbed_dut_count=None,
280                      no_delay=False):
281        """Schedule |suite|, if it hasn't already been run.
282
283        If |suite| has not already been run against |build| on |board|,
284        schedule it and return True.  If it has, return False.
285
286        @param suite: the name of the suite to run, e.g. 'bvt'
287        @param board: the board to run the suite on, e.g. x86-alex
288        @param build: the ChromeOS build to install e.g.
289                      x86-alex-release/R18-1655.0.0-a1-b1584.
290        @param pool: the pool of machines to use for scheduling purposes.
291        @param num: the number of devices across which to shard the test suite.
292                    Type: integer or None
293        @param priority: One of the values from
294                         client.common_lib.priorities.Priority.
295        @param timeout: The max lifetime of the suite in hours.
296        @param force: Always schedule the suite.
297        @param file_bugs: True if bug filing is desired for this suite.
298        @param firmware_rw_build: Firmware build to update RW firmware. Default
299                                  to None.
300        @param firmware_ro_build: Firmware build to update RO firmware. Default
301                                  to None.
302        @param test_source_build: Build with the source of tests. Default to
303                                  None to use the ChromeOS build.
304        @param job_retry: Set to True to enable job-level retry. Default is
305                          False.
306        @param launch_control_build: Name of a Launch Control build, e.g.,
307                                     'git_mnc_release/shamu-eng/123'
308        @param run_prod_code: If True, the suite will run the test code that
309                              lives in prod aka the test code currently on the
310                              lab servers. If False, the control files and test
311                              code for this suite run will be retrieved from the
312                              build artifacts. Default is False.
313        @param testbed_dut_count: Number of duts to test when using a testbed.
314        @param no_delay: Set to True to allow suite to be created without
315                configuring delay_minutes. Default is False.
316
317        @return True if the suite got scheduled, False if not
318        @raise DedupException if we can't check for dups.
319        @raise ScheduleException if the suite cannot be scheduled.
320
321        """
322        if (force or self._ShouldScheduleSuite(
323                suite, board,
324                test_source_build or build or launch_control_build)):
325            return self._Schedule(suite, board, build, pool, num, priority,
326                                  timeout, file_bugs=file_bugs,
327                                  firmware_rw_build=firmware_rw_build,
328                                  firmware_ro_build=firmware_ro_build,
329                                  test_source_build=test_source_build,
330                                  job_retry=job_retry,
331                                  launch_control_build=launch_control_build,
332                                  run_prod_code=run_prod_code,
333                                  testbed_dut_count=testbed_dut_count,
334                                  no_delay=no_delay)
335        return False
336
337
338    def CheckHostsExist(self, *args, **kwargs):
339        """Forward a request to check if hosts matching args, kwargs exist."""
340        try:
341            kwargs['min_rpc_timeout'] = _MIN_RPC_TIMEOUT
342            return self._afe.get_hostnames(*args, **kwargs)
343        except error.TimeoutException as e:
344            logging.exception(e)
345            return []
346