• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import collections
6import httplib
7import logging
8import os
9import re
10import socket
11import time
12
13import common
14
15from autotest_lib.client.common_lib import error
16from autotest_lib.client.common_lib.cros import retry
17from autotest_lib.server import utils
18
19
20GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
21GOOFY_RUNNING = 'RUNNING'
22
23
24class GoofyProxyException(Exception):
25    """Exception raised when a goofy rpc fails."""
26    pass
27
28
29class GoofyRuntimeException(Exception):
30    """Exception raised when something goes wrong while a test is running."""
31    pass
32
33
34def retry_goofy_rpc(exception_tuple, timeout_min=30):
35    """A decorator to use with goofy rpcs.
36
37    This decorator tries to recreate the goofy client proxy on
38    socket error. It will continue trying to do so until it
39    executes the method without any socket errors or till the
40    retry.retry decorator hits it's timeout.
41
42    Usage:
43        If you just want to recreate the proxy:
44        1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error),
45                            timeout_min=<timeout>)
46        2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>)
47            Note: you need to specify the socket.error exception because we
48            want to retry the call after recreating the proxy.
49
50    @param exception_tuple: A tuple of exceptions to pass to
51        the retry decorator. Any of these exceptions will result
52        in retries for the duration of timeout_min.
53    @param timeout_min: The timeout, in minutes, for which we should
54        retry the method ignoring any exception in exception_tuple.
55    """
56    def inner_decorator(method):
57        """Inner retry decorator applied to the method.
58
59        @param method: The method that needs to be wrapped in the decorator.
60
61        @return A wrapper function that implements the retry.
62        """
63
64        @retry.retry(exception_tuple, timeout_min=timeout_min)
65        def wrapper(*args, **kwargs):
66            """This wrapper handles socket errors.
67
68            If the method in question:
69            1. Throws an exception in exception_tuple and it is not a
70               socket.error, retry for timeout_min through retry.retry.
71            2. Throws a socket.error, recreate the client proxy, and
72               retry for timeout_min through retry.retry.
73            3. Throws an exception not in exception_tuple, fail.
74            """
75            try:
76                return method(*args, **kwargs)
77            except socket.error as e:
78                goofy_proxy = args[0]
79                if type(goofy_proxy) is GoofyProxy:
80                    logging.warning('Socket error while running factory tests '
81                                    '%s, recreating goofy proxy.', e)
82                    goofy_proxy._create_client_proxy(timeout_min=timeout_min)
83                else:
84                    logging.warning('Connectivity was lost and the retry '
85                                    'decorator was unable to recreate a goofy '
86                                    'client proxy, args: %s.', args)
87                raise
88
89        return wrapper
90
91    return inner_decorator
92
93
94class GoofyProxy(object):
95    """Client capable of making rpc calls to goofy.
96
97    Methods of this class that can cause goofy to change state
98    usually need a retry decorator. Methods that have a retry decorator
99    need to be 'pure', i.e return the same results when called multiple
100    times with the same argument.
101
102    There are 2 known exceptions this class can deal with, a socket.error
103    which happens when we try to execute an rpc when the DUT is, say, suspended
104    and a BadStatusLine, which we get when we try to execute an rpc while the
105    DUT is going through a factory_restart. Ideally we would like to handle
106    socket timeouts different from BadStatusLines as we can get connection
107    errors even when a device reboots and BadStatusLines ususally only when
108    factory restarts. crbug.com/281714.
109    """
110
111    # This timeout was arbitrarily chosen as many tests in the factory test
112    # suite run for days. Ideally we would like to split this into at least 2
113    # timeouts, one which we use for rpcs that run while no other test is,
114    # running and is smaller than the second that is designed for use with rpcs
115    # that might execute simultaneously with a test. The latter needs a longer
116    # timeout since tests could suspend,resume for a long time, and a call like
117    # GetGoofyStatus should be tolerant to these suspend/resumes. In designing
118    # the base timeout one needs to allocate time to component methods of this
119    # class (such as _set_test_list) as a multiple of the number of rpcs it
120    # executes.
121    BASE_RPC_TIMEOUT = 1440
122    POLLING_INTERVAL = 5
123    FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*'
124    UNTAR_COMMAND = 'tar jxf %s -C %s'
125
126
127    def __init__(self, host):
128        """
129        @param host: The host object representing the DUT running goofy.
130        """
131        self._host = host
132        self._raw_stop_running_tests()
133        self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT)
134
135
136    def _create_client_proxy(self, timeout_min=30):
137        """Create a goofy client proxy.
138
139        Ping the host till it's up, then proceed to create a goofy proxy. We
140        don't wrap this method with a retry because it's used in the retry
141        decorator itself.
142        """
143
144        # We don't ssh ping here as there is a potential dealy in O(minutes)
145        # with our ssh command against a sleeping DUT, once it wakes up, and
146        # that will lead to significant overhead incurred over many reboots.
147        self._host.ping_wait_up(timeout_min)
148        logging.info('Host is pingable, creating goofy client proxy')
149        self._client = self._host.rpc_server_tracker.jsonrpc_connect(
150                GOOFY_JSONRPC_SERVER_PORT)
151
152
153    @retry.retry((httplib.BadStatusLine, socket.error),
154                 timeout_min=BASE_RPC_TIMEOUT)
155    def _raw_stop_running_tests(self):
156        """Stop running tests by shelling out to the DUT.
157
158        Use this method only before we have actually created the client
159        proxy, as shelling out has several pitfalls. We need to stop all
160        tests in a retry loop because tests will start executing as soon
161        as we have reimaged a DUT and trying to create the proxy while
162        the DUT is rebooting will lead to a spurious failure.
163
164        Note that we use the plain retry decorator for this method since
165        we don't need to recreate the client proxy on failure.
166        """
167        logging.info('Stopping all tests and clearing factory state')
168        self._host.run('factory clear')
169
170
171    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
172                     timeout_min=BASE_RPC_TIMEOUT)
173    def _get_goofy_status(self):
174        """Return status of goofy, ignoring socket timeouts and http exceptions.
175        """
176        status = self._client.GetGoofyStatus().get('status')
177        return status
178
179
180    def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
181        """Wait till goofy is running or a timeout occurs.
182
183        @param timeout_min: Minutes to wait before timing this call out.
184        """
185        current_time = time.time()
186        timeout_secs = timeout_min * 60
187        logging.info('Waiting on goofy')
188        while self._get_goofy_status() != GOOFY_RUNNING:
189            if time.time() - current_time > timeout_secs:
190                break
191        return
192
193
194    @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
195    def _set_test_list(self, next_list):
196        """Set the given test list for execution and turn on test automation.
197
198        Confirm that the given test list is a test that has been baked into
199        the image, then run it. Some test lists are configured to start
200        execution automatically when we call SetTestList, while others wait
201        for a corresponding RunTest.
202
203        @param next_list: The name of the test list.
204
205        @raise jsonrpclib.ProtocolError: If the test list we're trying to switch
206                                         to isn't on the DUT.
207        """
208
209        # We can get a BadStatus line on 2 occassions:
210        # 1. As part of SwitchTestList goofy performs a factory restart, which
211        # will throw a BadStatusLine because the rpc can't exit cleanly. We
212        # don't want to retry on this exception, since we've already set the
213        # right test list.
214        # 2. If we try to set a test list while goofy is already down
215        # (from a previous factory restart). In this case we wouldn't have
216        # set the new test list, because we coulnd't connect to goofy.
217        # To properly set a new test list it's important to wait till goofy is
218        # up before attempting to set the test list, while being aware that the
219        # preceding httplib error is from the rpc we just executed leading to
220        # a factory restart. Also note that if the test list is not already on
221        # the DUT this method will fail, emitting the possible test lists one
222        # can switch to.
223        self._wait_for_goofy()
224        logging.info('Switching to test list %s', next_list)
225        try:
226            # Enable full factory test automation. Full test automation mode
227            # skips all manual tests and test barriers, which is what we want in
228            # the test lab. There are other automation modes: partial and none.
229            # In partial automation mode manual tests and barrier are enabled
230            # and user intervention is required; none disables automation.
231            self._client.SwitchTestList(next_list, 'full')
232        except httplib.BadStatusLine:
233            logging.info('Switched to list %s, goofy restarting', next_list)
234
235
236    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
237                     timeout_min=BASE_RPC_TIMEOUT*2)
238    def _stop_running_tests(self):
239        """Stop all running tests.
240
241        Wrap the StopTest rpc so we can attempt to stop tests even while a DUT
242        is suspended or rebooting.
243        """
244        logging.info('Stopping tests.')
245        self._client.StopTest()
246
247
248    def _get_test_map(self):
249        """Get a mapping of test suites -> tests.
250
251        Ignore entries for tests that don't have a path.
252
253        @return: A dictionary of the form
254                 {'suite_name': ['suite_name.path_to_test', ...]}.
255        """
256        test_all = set([test['path'] for test in self._client.GetTests()
257                        if test.get('path')])
258
259        test_map = collections.defaultdict(list)
260        for names in test_all:
261            test_map[names.split('.')[0]].append(names)
262        return test_map
263
264
265    def _log_test_results(self, test_status, current_suite):
266        """Format test status results and write them to status.log.
267
268        @param test_status: The status dictionary of a single test.
269        @param current_suite: The current suite name.
270        """
271        try:
272            self._host.job.record('INFO', None, None,
273                                  'suite %s, test %s, status: %s' %
274                                  (current_suite, test_status.get('path'),
275                                   test_status.get('status')))
276        except AttributeError as e:
277            logging.error('Could not gather results for current test: %s', e)
278
279
280    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
281                     timeout_min=BASE_RPC_TIMEOUT*2)
282    def _get_test_info(self, test_name):
283        """Get the status of one test.
284
285        @param test_name: The name of the test we need the status of.
286
287        @return: The entry for the test in the status dictionary.
288        """
289        for test in self._client.GetTests():
290            if test['path'] == test_name:
291                return test
292        raise ValueError('Could not find test_name %s in _get_test_info.' %
293                          test_name)
294
295
296    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
297                     timeout_min=BASE_RPC_TIMEOUT*2)
298    def _get_test_run_info(self, run_id):
299        """Get the information about the given test run.
300
301        @param run_id: The ID of the test run.
302
303        @return: A dict of test run status.
304        """
305        return self._client.GetTestRunStatus(run_id)
306
307
308    def _wait_on_run(self, run_id):
309        """Wait until the given test run to end.
310
311        @param run_id: The ID of the test run.
312
313        @raises GoofyRuntimeException: If the test run does not finish
314            gracefully.
315        """
316        finished_tests = set()
317        run_info = self._get_test_run_info(run_id)
318        while run_info['status'] == 'RUNNING':
319            finished = [(t['path'], t['status']) for t in
320                        run_info['scheduled_tests']
321                        if t['status'] in ('PASSED', 'FAILED')]
322            for t in finished:
323                if t not in finished_tests:
324                    logging.info('[%s] %s', t[1], t[0])
325                    finished_tests.add(t)
326            time.sleep(self.POLLING_INTERVAL)
327            run_info = self._get_test_run_info(run_id)
328        if run_info['status'] != 'FINISHED':
329            raise GoofyRuntimeException(
330                    'The requested test run was interrupted.')
331
332
333    def _synchronous_run_suite(self, suite_name):
334        """Run one suite and wait for it to finish.
335
336        Will start a test run for the specified suite_name and wait until it
337        ends.
338
339        @param suite_name: The name of the suite to wait for.
340
341        @raises GoofyProxyException: If the status of the suite
342            doesn't switch to active after we call RunTest.
343
344        @return: The result of the suite.
345        """
346        logging.info('Starting suite: %s', suite_name)
347        run_id = self._client.RunTest(suite_name)
348        self._wait_on_run(run_id)
349        return self._get_test_run_info(run_id)
350
351
352    def monitor_tests(self, test_list):
353        """Run a test list.
354
355        Will run each suite in the given list in sequence, starting each one
356        by name and waiting on its results. This method makes the following
357        assumptions:
358            - A test list is made up of self contained suites.
359            - These suites trigger several things in parallel.
360            - After a suite finishes it leaves goofy in an idle state.
361
362        It is not safe to pull results for individual tests during the suite
363        as the device could be rebooting, or goofy could be under stress.
364        Instead, this method synchronously waits on an entire suite, then
365        asks goofy for the status of each test in the suite. Since certain
366        test lists automatically start and others don't, this method stops
367        test list execution regardless, and sequentially triggers each suite.
368
369        @param test_list: The test list to run.
370        """
371        self._set_test_list(test_list)
372        self._wait_for_goofy()
373        self._stop_running_tests()
374
375        test_map = self._get_test_map()
376        if test_map:
377            logging.info('About to execute tests: %s', test_map)
378        else:
379            raise GoofyRuntimeException('Test map is empty, you might have an '
380                                        'error in your test_list.')
381
382
383        for current_suite in test_map.keys():
384            logging.info('Processing suite %s', current_suite)
385
386            result = self._synchronous_run_suite(current_suite)
387            logging.info(result)
388
389            for test_names in test_map.get(current_suite):
390                self._log_test_results(self._get_test_info(test_names),
391                                       current_suite)
392
393
394    @retry_goofy_rpc((httplib.BadStatusLine, socket.error),
395                     timeout_min=BASE_RPC_TIMEOUT*2)
396    def get_results(self, resultsdir):
397        """Copies results from the DUT to a local results directory.
398
399        Copy the tarball over to the results folder, untar, and delete the
400        tarball if everything was successful. This will effectively place
401        all the logs relevant to factory testing in the job's results folder.
402
403        @param resultsdir: The directory in which to untar the contents of the
404                           tarball factory_bug generates.
405        """
406        logging.info('Getting results logs for test_list.')
407
408        try:
409            factory_bug_log = self._host.run('factory_bug').stderr
410        except error.CmdError as e:
411            logging.error('Could not execute factory_bug: %s', e)
412            return
413
414        try:
415            factory_bug_tar = re.match(self.FACTORY_BUG_RE,
416                                       factory_bug_log).groups(1)[0]
417        except (IndexError, AttributeError):
418            logging.error('could not collect logs for factory results, '
419                          'factory bug returned %s', factory_bug_log)
420            return
421
422        factory_bug_tar_file = os.path.basename(factory_bug_tar)
423        local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)
424
425        try:
426            self._host.get_file(factory_bug_tar, local_factory_bug_tar)
427        except error.AutoservRunError as e:
428            logging.error('Failed to pull back the results tarball: %s', e)
429            return
430
431        try:
432            utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
433        except error.CmdError as e:
434            logging.error('Failed to untar the results tarball: %s', e)
435            return
436        finally:
437            if os.path.exists(local_factory_bug_tar):
438                os.remove(local_factory_bug_tar)
439