# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import datetime
import logging
import threading

import common
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.server import site_utils
from autotest_lib.server.cros import provision
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite import reporting

try:
    from chromite.lib import metrics
except ImportError:
    # Fall back to the mock exposed by site_utils when chromite is missing.
    metrics = site_utils.metrics_mock


CONFIG = global_config.global_config

# Default maximum runtime of a job, in minutes. Read from the AUTOTEST_WEB
# section of the global config; 72 hours when the option is not set.
JOB_MAX_RUNTIME_MINS_DEFAULT = CONFIG.get_config_value(
        'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int,
        default=72 * 60)

# Lower bound on the RPC timeout for calls that are expected to be slow, such
# as create_suite_job. When the default socket timeout
# (socket.getdefaulttimeout()) is None or larger than this value, the default
# is used instead. The value is meant to match the timeout of the RetryingAFE
# object, so long running RPCs can wait long enough before being aborted.
_MIN_RPC_TIMEOUT = 600

# How many days back to look when searching for an existing job.
SEARCH_JOB_MAX_DAYS = 14

# Step, in minutes, by which DedupingScheduler.delay_minutes is changed.
# Stepping the delay lets all suite jobs created in the same event start their
# provision jobs at different times. With 5 minutes, 40 boards have their
# provision jobs started within about 200 minutes. That way, we don't add too
# much delay on test jobs and do not keep suite jobs running for too long.
# Note that suite jobs created by suite scheduler do not wait for test jobs to
# finish, which helps to reduce the load on the drone.
DELAY_MINUTES_INTERVAL = 5
# Set maximum delay minutes to 24 hours. This is to prevent suite jobs from
# running for too long.
# Nightly and new_build tasks won't create so many suites that they need a
# delay this long (1440 minutes, i.e. 24 hours). However, weekly tasks can
# create several hundreds of suites, as most of them are required to run on
# all branches.
MAX_DELAY_MINUTES = 1440


class DedupingSchedulerException(Exception):
    """Base class for exceptions from this module."""
    pass


class ScheduleException(DedupingSchedulerException):
    """Raised when an error is returned from the AFE during scheduling."""
    pass


class DedupException(DedupingSchedulerException):
    """Raised when an error occurs while checking for duplicate jobs."""
    pass


class DedupingScheduler(object):
    """A class that will schedule suites to run on a given board, build.

    Includes logic to check whether or not a given (suite, board, build)
    has already been run.  If so, it will skip scheduling that suite.

    @var _afe: a frontend.AFE instance used to talk to autotest.
    @var _file_bug: True if a bug should be filed when a suite cannot be
                    scheduled because of a control file problem.
    @var delay_minutes: delay, in minutes, handed to the next suite job.
    @var delay_minutes_interval: signed step applied to delay_minutes after
                                 each suite job is handed a delay.
    """

    _SUITE_SCHEDULER_SUITE_COUNT = metrics.Counter(
            'chromeos/autotest/suite_scheduler/suite/created')

    def __init__(self, afe=None, file_bug=False):
        """Constructor

        @param afe: an instance of AFE as defined in server/frontend.py.
                    Defaults to a frontend_wrappers.RetryingAFE instance.
        @param file_bug: True to file a bug when scheduling fails because of
                         a missing, empty or malformed control file. Default
                         is False.
        """
        self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
                                                         delay_sec=10,
                                                         debug=False)
        self._file_bug = file_bug

        # Number of minutes to delay a suite job from creating test jobs.
        self.delay_minutes = 0
        # Number of minutes to increase or decrease self.delay_minutes. When
        # self.delay_minutes reaches MAX_DELAY_MINUTES, it should wind down
        # to allow even distribution of test job creation.
        self.delay_minutes_interval = DELAY_MINUTES_INTERVAL
        # Lock to make sure each suite is created with a different
        # delay_minutes.
        self._lock = threading.Lock()


    def _ShouldScheduleSuite(self, suite, board, test_source_build):
        """Return True if |suite| has not yet been run for |build| on |board|.

        True if |suite| has not been run for |build| on |board|, and
        the lab is open for this particular request.  False otherwise.

        Note that the duplicate search only matches on the job name (it must
        start with |test_source_build| and end with 'control.<suite>') and on
        the creation time; |board| is only used for logging here.

        @param suite: the name of the suite to run, e.g. 'bvt'
        @param board: the board to run the suite on, e.g. x86-alex
        @param test_source_build: Build with the source of tests.

        @return False if the suite was already scheduled, True if not
        @raise DedupException if the AFE raises while searching for jobs.

        """
        try:
            site_utils.check_lab_status(test_source_build)
        except site_utils.TestLabException as ex:
            logging.debug('Skipping suite %s, board %s, build %s:  %s',
                          suite, board, test_source_build, str(ex))
            return False
        try:
            # Only look at jobs created within the last SEARCH_JOB_MAX_DAYS.
            start_time = str(datetime.datetime.now() -
                             datetime.timedelta(days=SEARCH_JOB_MAX_DAYS))
            return not self._afe.get_jobs(
                    name__istartswith=test_source_build,
                    name__iendswith='control.'+suite,
                    created_on__gte=start_time,
                    min_rpc_timeout=_MIN_RPC_TIMEOUT)
        except Exception as e:
            raise DedupException(e)


    @staticmethod
    def _GetBuilds(build, firmware_rw_build, firmware_ro_build,
                   launch_control_build, testbed_dut_count):
        """Build the `builds` dictionary passed to create_suite_job.

        @param build: the ChromeOS build to install, or None.
        @param firmware_rw_build: Firmware build to update RW firmware, or
                                  None.
        @param firmware_ro_build: Firmware build to update RO firmware, or
                                  None.
        @param launch_control_build: Name of a Launch Control build, or None.
        @param testbed_dut_count: Number of duts to test when using a testbed,
                                  or None when not using a testbed.

        @return A dictionary mapping provision version prefixes to builds.
        @raise ScheduleException if neither |build| nor |launch_control_build|
               is given, as there would be nothing to run the suite against.

        """
        if not build and not launch_control_build:
            raise ScheduleException(
                    'Either build or launch_control_build must be specified '
                    'to schedule a suite.')
        builds = {}
        if build:
            builds[provision.CROS_VERSION_PREFIX] = build
        if firmware_rw_build:
            builds[provision.FW_RW_VERSION_PREFIX] = firmware_rw_build
        if firmware_ro_build:
            builds[provision.FW_RO_VERSION_PREFIX] = firmware_ro_build
        if launch_control_build:
            # A Launch Control build replaces everything collected so far.
            if testbed_dut_count is None:
                builds = {provision.ANDROID_BUILD_VERSION_PREFIX:
                          launch_control_build}
            else:
                builds = {provision.TESTBED_BUILD_VERSION_PREFIX:
                          launch_control_build}
        return builds


    def _ReserveDelayMinutes(self, suite, builds, board, pool):
        """Return the delay for the next suite job and step the shared value.

        Suite scheduler handles all boards in parallel, to guarantee each
        call of `create_suite_job` use different value of delay_minutes,
        we need a lock around get/set attempts of self.delay_minutes.
        To prevent suite jobs from running too long, the value for
        self.delay_minutes is limited between 0 and MAX_DELAY_MINUTES (24
        hours). The value starts at 0 and is increased by
        DELAY_MINUTES_INTERVAL, when it reaches MAX_DELAY_MINUTES, the
        logic here allows its value to step back by DELAY_MINUTES_INTERVAL
        at each call of this method. When the value drops back to 0, it
        will increase again in the next call of this method.
        Such logic allows the values of delay_minutes for all calls
        of `create_suite_job` running in parallel to be evenly distributed
        between 0 and MAX_DELAY_MINUTES.

        @param suite: the name of the suite, used for logging only.
        @param builds: the builds dictionary, used for logging only.
        @param board: the board name, used for logging only.
        @param pool: the pool name, used for logging only.

        @return The value of self.delay_minutes before it was stepped.

        """
        with self._lock:
            delay_minutes = self.delay_minutes
            if ((self.delay_minutes < MAX_DELAY_MINUTES and
                 self.delay_minutes_interval > 0) or
                (self.delay_minutes >= DELAY_MINUTES_INTERVAL and
                 self.delay_minutes_interval < 0)):
                self.delay_minutes += self.delay_minutes_interval
            else:
                # A limit was hit: only the direction is flipped here, the
                # value itself is not stepped, so the next call is handed
                # the same delay as this one.
                limit = ('Maximum' if self.delay_minutes_interval > 0
                         else 'Minimum')
                logging.info(
                        '%s delay minutes reached when scheduling '
                        '%s on %s against %s (pool: %s)',
                        limit, suite, builds, board, pool)
                self.delay_minutes_interval = (
                        -self.delay_minutes_interval)
            return delay_minutes


    def _Schedule(self, suite, board, build, pool, num, priority, timeout,
                  file_bugs=False, firmware_rw_build=None,
                  firmware_ro_build=None, test_source_build=None,
                  job_retry=False, launch_control_build=None,
                  run_prod_code=False, testbed_dut_count=None, no_delay=False):
        """Schedule |suite|, if it hasn't already been run.

        @param suite: the name of the suite to run, e.g. 'bvt'
        @param board: the board to run the suite on, e.g. x86-alex
        @param build: the ChromeOS build to install e.g.
                      x86-alex-release/R18-1655.0.0-a1-b1584.
        @param pool: the pool of machines to use for scheduling purposes.
        @param num: the number of devices across which to shard the test suite.
                    Type: integer or None
                    (None uses sharding factor in global_config.ini).
        @param priority: One of the values from
                         client.common_lib.priorities.Priority.
        @param timeout: The max lifetime of the suite in hours. A false value
                        means JOB_MAX_RUNTIME_MINS_DEFAULT is used.
        @param file_bugs: True if bug filing is desired for this suite.
        @param firmware_rw_build: Firmware build to update RW firmware. Default
                                  to None.
        @param firmware_ro_build: Firmware build to update RO firmware. Default
                                  to None.
        @param test_source_build: Build that contains the server-side test code.
                                  Default to None to use the ChromeOS build
                                  (defined by `build`).
        @param job_retry: Set to True to enable job-level retry. Default is
                          False.
        @param launch_control_build: Name of a Launch Control build, e.g.,
                                     'git_mnc_release/shamu-eng/123'
        @param run_prod_code: If True, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers. If False, the control files and test
                              code for this suite run will be retrieved from the
                              build artifacts. Default is False.
        @param testbed_dut_count: Number of duts to test when using a testbed.
        @param no_delay: Set to True to allow suite to be created without
                         configuring delay_minutes. Default is False.

        @return True if the suite got scheduled. False if a control file
                problem was hit and a bug was filed (or found) for it.
        @raise ScheduleException if an error occurs while scheduling.

        """
        try:
            builds = self._GetBuilds(build, firmware_rw_build,
                                     firmware_ro_build, launch_control_build,
                                     testbed_dut_count)

            delay_minutes = 0
            if not no_delay:
                delay_minutes = self._ReserveDelayMinutes(
                        suite, builds, board, pool)

            # Update timeout settings for the suite job with delay_minutes.
            # `timeout` is in hours.
            if not timeout:
                timeout = JOB_MAX_RUNTIME_MINS_DEFAULT / 60.0
            timeout += delay_minutes / 60.0
            # NOTE: the two values below are derived from the default max
            # runtime, not from the `timeout` argument.
            max_runtime_mins = JOB_MAX_RUNTIME_MINS_DEFAULT + delay_minutes
            timeout_mins = JOB_MAX_RUNTIME_MINS_DEFAULT + delay_minutes

            logging.info('Scheduling %s on %s against %s (pool: %s)...',
                         suite, builds, board, pool)
            job_id = self._afe.run('create_suite_job', name=suite, board=board,
                                   builds=builds, check_hosts=False, num=num,
                                   pool=pool, priority=priority,
                                   timeout=timeout,
                                   max_runtime_mins=max_runtime_mins,
                                   timeout_mins=timeout_mins,
                                   file_bugs=file_bugs,
                                   wait_for_results=file_bugs,
                                   test_source_build=test_source_build,
                                   job_retry=job_retry,
                                   delay_minutes=delay_minutes,
                                   run_prod_code=run_prod_code,
                                   min_rpc_timeout=_MIN_RPC_TIMEOUT)
            if job_id is None:
                raise ScheduleException(
                        "Can't schedule %s for %s." % (suite, builds))

            logging.info('... created as suite job id %s', job_id)
            # Report data to metrics.
            fields = {'suite': suite,
                      'board': board,
                      'pool': pool,
                      'priority': str(priority)}
            self._SUITE_SCHEDULER_SUITE_COUNT.increment(fields=fields)
            return True
        except ScheduleException:
            # Already the exception type callers expect; re-raise it as is
            # rather than wrapping it in a second ScheduleException below.
            raise
        except (error.ControlFileNotFound, error.ControlFileEmpty,
                error.ControlFileMalformed, error.NoControlFileList) as e:
            if self._file_bug:
                # File bug on test_source_build if it's specified.
                b = reporting.SuiteSchedulerBug(
                        suite, test_source_build or build, board, e)
                # If a bug has been filed with the same
                # <suite, build, error type>, this will not file again, but
                # simply gets the existing bug id.
                bid, _ = reporting.Reporter().report(
                        b, ignore_duplicate=True)
                if bid is not None:
                    return False
            # Raise the exception if not filing a bug or failed to file bug.
            raise ScheduleException(e)
        except Exception as e:
            raise ScheduleException(e)


    def ScheduleSuite(self, suite, board, build, pool, num, priority, timeout,
                      force=False, file_bugs=False, firmware_rw_build=None,
                      firmware_ro_build=None, test_source_build=None,
                      job_retry=False, launch_control_build=None,
                      run_prod_code=False, testbed_dut_count=None,
                      no_delay=False):
        """Schedule |suite|, if it hasn't already been run.

        If |suite| has not already been run against |build| on |board|,
        schedule it and return True.  If it has, return False.

        The duplicate check is done against the first of test_source_build,
        build and launch_control_build that is set.

        @param suite: the name of the suite to run, e.g. 'bvt'
        @param board: the board to run the suite on, e.g. x86-alex
        @param build: the ChromeOS build to install e.g.
                      x86-alex-release/R18-1655.0.0-a1-b1584.
        @param pool: the pool of machines to use for scheduling purposes.
        @param num: the number of devices across which to shard the test suite.
                    Type: integer or None
        @param priority: One of the values from
                         client.common_lib.priorities.Priority.
        @param timeout: The max lifetime of the suite in hours.
        @param force: Always schedule the suite.
        @param file_bugs: True if bug filing is desired for this suite.
        @param firmware_rw_build: Firmware build to update RW firmware. Default
                                  to None.
        @param firmware_ro_build: Firmware build to update RO firmware. Default
                                  to None.
        @param test_source_build: Build with the source of tests. Default to
                                  None to use the ChromeOS build.
        @param job_retry: Set to True to enable job-level retry. Default is
                          False.
        @param launch_control_build: Name of a Launch Control build, e.g.,
                                     'git_mnc_release/shamu-eng/123'
        @param run_prod_code: If True, the suite will run the test code that
                              lives in prod aka the test code currently on the
                              lab servers. If False, the control files and test
                              code for this suite run will be retrieved from the
                              build artifacts. Default is False.
        @param testbed_dut_count: Number of duts to test when using a testbed.
        @param no_delay: Set to True to allow suite to be created without
                         configuring delay_minutes. Default is False.

        @return True if the suite got scheduled, False if not
        @raise DedupException if we can't check for dups.
        @raise ScheduleException if the suite cannot be scheduled.

        """
        if (force or self._ShouldScheduleSuite(
                suite, board,
                test_source_build or build or launch_control_build)):
            return self._Schedule(suite, board, build, pool, num, priority,
                                  timeout, file_bugs=file_bugs,
                                  firmware_rw_build=firmware_rw_build,
                                  firmware_ro_build=firmware_ro_build,
                                  test_source_build=test_source_build,
                                  job_retry=job_retry,
                                  launch_control_build=launch_control_build,
                                  run_prod_code=run_prod_code,
                                  testbed_dut_count=testbed_dut_count,
                                  no_delay=no_delay)
        return False


    def CheckHostsExist(self, *args, **kwargs):
        """Forward a request to check if hosts matching args, kwargs exist.

        The request is forwarded to the AFE's get_hostnames with
        min_rpc_timeout forced to _MIN_RPC_TIMEOUT.

        @return Whatever get_hostnames returns, or an empty list if the RPC
                times out (the timeout is logged).

        """
        try:
            kwargs['min_rpc_timeout'] = _MIN_RPC_TIMEOUT
            return self._afe.get_hostnames(*args, **kwargs)
        except error.TimeoutException as e:
            logging.exception(e)
            return []