1# Lint as: python2, python3 2# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6import collections 7import six.moves.http_client 8import logging 9import os 10import re 11import socket 12import time 13 14import common 15 16from autotest_lib.client.common_lib import error 17from autotest_lib.client.common_lib.cros import retry 18from autotest_lib.server import utils 19 20 21GOOFY_JSONRPC_SERVER_PORT = 0x0FAC 22GOOFY_RUNNING = 'RUNNING' 23 24 25class GoofyProxyException(Exception): 26 """Exception raised when a goofy rpc fails.""" 27 pass 28 29 30class GoofyRuntimeException(Exception): 31 """Exception raised when something goes wrong while a test is running.""" 32 pass 33 34 35def retry_goofy_rpc(exception_tuple, timeout_min=30): 36 """A decorator to use with goofy rpcs. 37 38 This decorator tries to recreate the goofy client proxy on 39 socket error. It will continue trying to do so until it 40 executes the method without any socket errors or till the 41 retry.retry decorator hits it's timeout. 42 43 Usage: 44 If you just want to recreate the proxy: 45 1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error), 46 timeout_min=<timeout>) 47 2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>) 48 Note: you need to specify the socket.error exception because we 49 want to retry the call after recreating the proxy. 50 51 @param exception_tuple: A tuple of exceptions to pass to 52 the retry decorator. Any of these exceptions will result 53 in retries for the duration of timeout_min. 54 @param timeout_min: The timeout, in minutes, for which we should 55 retry the method ignoring any exception in exception_tuple. 56 """ 57 def inner_decorator(method): 58 """Inner retry decorator applied to the method. 59 60 @param method: The method that needs to be wrapped in the decorator. 61 62 @return A wrapper function that implements the retry. 63 """ 64 65 @retry.retry(exception_tuple, timeout_min=timeout_min) 66 def wrapper(*args, **kwargs): 67 """This wrapper handles socket errors. 68 69 If the method in question: 70 1. Throws an exception in exception_tuple and it is not a 71 socket.error, retry for timeout_min through retry.retry. 72 2. Throws a socket.error, recreate the client proxy, and 73 retry for timeout_min through retry.retry. 74 3. Throws an exception not in exception_tuple, fail. 75 """ 76 try: 77 return method(*args, **kwargs) 78 except socket.error as e: 79 goofy_proxy = args[0] 80 if type(goofy_proxy) is GoofyProxy: 81 logging.warning('Socket error while running factory tests ' 82 '%s, recreating goofy proxy.', e) 83 goofy_proxy._create_client_proxy(timeout_min=timeout_min) 84 else: 85 logging.warning('Connectivity was lost and the retry ' 86 'decorator was unable to recreate a goofy ' 87 'client proxy, args: %s.', args) 88 raise 89 90 return wrapper 91 92 return inner_decorator 93 94 95class GoofyProxy(object): 96 """Client capable of making rpc calls to goofy. 97 98 Methods of this class that can cause goofy to change state 99 usually need a retry decorator. Methods that have a retry decorator 100 need to be 'pure', i.e return the same results when called multiple 101 times with the same argument. 102 103 There are 2 known exceptions this class can deal with, a socket.error 104 which happens when we try to execute an rpc when the DUT is, say, suspended 105 and a BadStatusLine, which we get when we try to execute an rpc while the 106 DUT is going through a factory_restart. Ideally we would like to handle 107 socket timeouts different from BadStatusLines as we can get connection 108 errors even when a device reboots and BadStatusLines ususally only when 109 factory restarts. crbug.com/281714. 110 """ 111 112 # This timeout was arbitrarily chosen as many tests in the factory test 113 # suite run for days. Ideally we would like to split this into at least 2 114 # timeouts, one which we use for rpcs that run while no other test is, 115 # running and is smaller than the second that is designed for use with rpcs 116 # that might execute simultaneously with a test. The latter needs a longer 117 # timeout since tests could suspend,resume for a long time, and a call like 118 # GetGoofyStatus should be tolerant to these suspend/resumes. In designing 119 # the base timeout one needs to allocate time to component methods of this 120 # class (such as _set_test_list) as a multiple of the number of rpcs it 121 # executes. 122 BASE_RPC_TIMEOUT = 1440 123 POLLING_INTERVAL = 5 124 FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*' 125 UNTAR_COMMAND = 'tar jxf %s -C %s' 126 127 128 def __init__(self, host): 129 """ 130 @param host: The host object representing the DUT running goofy. 131 """ 132 self._host = host 133 self._raw_stop_running_tests() 134 self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT) 135 136 137 def _create_client_proxy(self, timeout_min=30): 138 """Create a goofy client proxy. 139 140 Ping the host till it's up, then proceed to create a goofy proxy. We 141 don't wrap this method with a retry because it's used in the retry 142 decorator itself. 143 """ 144 145 # We don't ssh ping here as there is a potential dealy in O(minutes) 146 # with our ssh command against a sleeping DUT, once it wakes up, and 147 # that will lead to significant overhead incurred over many reboots. 148 self._host.ping_wait_up(timeout_min) 149 logging.info('Host is pingable, creating goofy client proxy') 150 self._client = self._host.rpc_server_tracker.jsonrpc_connect( 151 GOOFY_JSONRPC_SERVER_PORT) 152 153 154 @retry.retry((six.moves.http_client.BadStatusLine, socket.error), 155 timeout_min=BASE_RPC_TIMEOUT) 156 def _raw_stop_running_tests(self): 157 """Stop running tests by shelling out to the DUT. 158 159 Use this method only before we have actually created the client 160 proxy, as shelling out has several pitfalls. We need to stop all 161 tests in a retry loop because tests will start executing as soon 162 as we have reimaged a DUT and trying to create the proxy while 163 the DUT is rebooting will lead to a spurious failure. 164 165 Note that we use the plain retry decorator for this method since 166 we don't need to recreate the client proxy on failure. 167 """ 168 logging.info('Stopping all tests and clearing factory state') 169 self._host.run('factory clear') 170 171 172 @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error), 173 timeout_min=BASE_RPC_TIMEOUT) 174 def _get_goofy_status(self): 175 """Return status of goofy, ignoring socket timeouts and http exceptions. 176 """ 177 status = self._client.GetGoofyStatus().get('status') 178 return status 179 180 181 def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2): 182 """Wait till goofy is running or a timeout occurs. 183 184 @param timeout_min: Minutes to wait before timing this call out. 185 """ 186 current_time = time.time() 187 timeout_secs = timeout_min * 60 188 logging.info('Waiting on goofy') 189 while self._get_goofy_status() != GOOFY_RUNNING: 190 if time.time() - current_time > timeout_secs: 191 break 192 return 193 194 195 @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2) 196 def _set_test_list(self, next_list): 197 """Set the given test list for execution and turn on test automation. 198 199 Confirm that the given test list is a test that has been baked into 200 the image, then run it. Some test lists are configured to start 201 execution automatically when we call SetTestList, while others wait 202 for a corresponding RunTest. 203 204 @param next_list: The name of the test list. 205 206 @raise jsonrpclib.ProtocolError: If the test list we're trying to switch 207 to isn't on the DUT. 208 """ 209 210 # We can get a BadStatus line on 2 occassions: 211 # 1. As part of SwitchTestList goofy performs a factory restart, which 212 # will throw a BadStatusLine because the rpc can't exit cleanly. We 213 # don't want to retry on this exception, since we've already set the 214 # right test list. 215 # 2. If we try to set a test list while goofy is already down 216 # (from a previous factory restart). In this case we wouldn't have 217 # set the new test list, because we coulnd't connect to goofy. 218 # To properly set a new test list it's important to wait till goofy is 219 # up before attempting to set the test list, while being aware that the 220 # preceding httplib error is from the rpc we just executed leading to 221 # a factory restart. Also note that if the test list is not already on 222 # the DUT this method will fail, emitting the possible test lists one 223 # can switch to. 224 self._wait_for_goofy() 225 logging.info('Switching to test list %s', next_list) 226 try: 227 # Enable full factory test automation. Full test automation mode 228 # skips all manual tests and test barriers, which is what we want in 229 # the test lab. There are other automation modes: partial and none. 230 # In partial automation mode manual tests and barrier are enabled 231 # and user intervention is required; none disables automation. 232 self._client.SwitchTestList(next_list, 'full') 233 except six.moves.http_client.BadStatusLine: 234 logging.info('Switched to list %s, goofy restarting', next_list) 235 236 237 @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error), 238 timeout_min=BASE_RPC_TIMEOUT*2) 239 def _stop_running_tests(self): 240 """Stop all running tests. 241 242 Wrap the StopTest rpc so we can attempt to stop tests even while a DUT 243 is suspended or rebooting. 244 """ 245 logging.info('Stopping tests.') 246 self._client.StopTest() 247 248 249 def _get_test_map(self): 250 """Get a mapping of test suites -> tests. 251 252 Ignore entries for tests that don't have a path. 253 254 @return: A dictionary of the form 255 {'suite_name': ['suite_name.path_to_test', ...]}. 256 """ 257 test_all = set([test['path'] for test in self._client.GetTests() 258 if test.get('path')]) 259 260 test_map = collections.defaultdict(list) 261 for names in test_all: 262 test_map[names.split('.')[0]].append(names) 263 return test_map 264 265 266 def _log_test_results(self, test_status, current_suite): 267 """Format test status results and write them to status.log. 268 269 @param test_status: The status dictionary of a single test. 270 @param current_suite: The current suite name. 271 """ 272 try: 273 self._host.job.record('INFO', None, None, 274 'suite %s, test %s, status: %s' % 275 (current_suite, test_status.get('path'), 276 test_status.get('status'))) 277 except AttributeError as e: 278 logging.error('Could not gather results for current test: %s', e) 279 280 281 @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error), 282 timeout_min=BASE_RPC_TIMEOUT*2) 283 def _get_test_info(self, test_name): 284 """Get the status of one test. 285 286 @param test_name: The name of the test we need the status of. 287 288 @return: The entry for the test in the status dictionary. 289 """ 290 for test in self._client.GetTests(): 291 if test['path'] == test_name: 292 return test 293 raise ValueError('Could not find test_name %s in _get_test_info.' % 294 test_name) 295 296 297 @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error), 298 timeout_min=BASE_RPC_TIMEOUT*2) 299 def _get_test_run_info(self, run_id): 300 """Get the information about the given test run. 301 302 @param run_id: The ID of the test run. 303 304 @return: A dict of test run status. 305 """ 306 return self._client.GetTestRunStatus(run_id) 307 308 309 def _wait_on_run(self, run_id): 310 """Wait until the given test run to end. 311 312 @param run_id: The ID of the test run. 313 314 @raises GoofyRuntimeException: If the test run does not finish 315 gracefully. 316 """ 317 finished_tests = set() 318 run_info = self._get_test_run_info(run_id) 319 while run_info['status'] == 'RUNNING': 320 finished = [(t['path'], t['status']) for t in 321 run_info['scheduled_tests'] 322 if t['status'] in ('PASSED', 'FAILED')] 323 for t in finished: 324 if t not in finished_tests: 325 logging.info('[%s] %s', t[1], t[0]) 326 finished_tests.add(t) 327 time.sleep(self.POLLING_INTERVAL) 328 run_info = self._get_test_run_info(run_id) 329 if run_info['status'] != 'FINISHED': 330 raise GoofyRuntimeException( 331 'The requested test run was interrupted.') 332 333 334 def _synchronous_run_suite(self, suite_name): 335 """Run one suite and wait for it to finish. 336 337 Will start a test run for the specified suite_name and wait until it 338 ends. 339 340 @param suite_name: The name of the suite to wait for. 341 342 @raises GoofyProxyException: If the status of the suite 343 doesn't switch to active after we call RunTest. 344 345 @return: The result of the suite. 346 """ 347 logging.info('Starting suite: %s', suite_name) 348 run_id = self._client.RunTest(suite_name) 349 self._wait_on_run(run_id) 350 return self._get_test_run_info(run_id) 351 352 353 def monitor_tests(self, test_list): 354 """Run a test list. 355 356 Will run each suite in the given list in sequence, starting each one 357 by name and waiting on its results. This method makes the following 358 assumptions: 359 - A test list is made up of self contained suites. 360 - These suites trigger several things in parallel. 361 - After a suite finishes it leaves goofy in an idle state. 362 363 It is not safe to pull results for individual tests during the suite 364 as the device could be rebooting, or goofy could be under stress. 365 Instead, this method synchronously waits on an entire suite, then 366 asks goofy for the status of each test in the suite. Since certain 367 test lists automatically start and others don't, this method stops 368 test list execution regardless, and sequentially triggers each suite. 369 370 @param test_list: The test list to run. 371 """ 372 self._set_test_list(test_list) 373 self._wait_for_goofy() 374 self._stop_running_tests() 375 376 test_map = self._get_test_map() 377 if test_map: 378 logging.info('About to execute tests: %s', test_map) 379 else: 380 raise GoofyRuntimeException('Test map is empty, you might have an ' 381 'error in your test_list.') 382 383 384 for current_suite in test_map.keys(): 385 logging.info('Processing suite %s', current_suite) 386 387 result = self._synchronous_run_suite(current_suite) 388 logging.info(result) 389 390 for test_names in test_map.get(current_suite): 391 self._log_test_results(self._get_test_info(test_names), 392 current_suite) 393 394 395 @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error), 396 timeout_min=BASE_RPC_TIMEOUT*2) 397 def get_results(self, resultsdir): 398 """Copies results from the DUT to a local results directory. 399 400 Copy the tarball over to the results folder, untar, and delete the 401 tarball if everything was successful. This will effectively place 402 all the logs relevant to factory testing in the job's results folder. 403 404 @param resultsdir: The directory in which to untar the contents of the 405 tarball factory_bug generates. 406 """ 407 logging.info('Getting results logs for test_list.') 408 409 try: 410 factory_bug_log = self._host.run('factory_bug').stderr 411 except error.CmdError as e: 412 logging.error('Could not execute factory_bug: %s', e) 413 return 414 415 try: 416 factory_bug_tar = re.match(self.FACTORY_BUG_RE, 417 factory_bug_log).groups(1)[0] 418 except (IndexError, AttributeError): 419 logging.error('could not collect logs for factory results, ' 420 'factory bug returned %s', factory_bug_log) 421 return 422 423 factory_bug_tar_file = os.path.basename(factory_bug_tar) 424 local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file) 425 426 try: 427 self._host.get_file(factory_bug_tar, local_factory_bug_tar) 428 except error.AutoservRunError as e: 429 logging.error('Failed to pull back the results tarball: %s', e) 430 return 431 432 try: 433 utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir)) 434 except error.CmdError as e: 435 logging.error('Failed to untar the results tarball: %s', e) 436 return 437 finally: 438 if os.path.exists(local_factory_bug_tar): 439 os.remove(local_factory_bug_tar) 440