• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7import datetime as datetime_base
8import logging
9from datetime import datetime
10
11import common
12
13from autotest_lib.client.common_lib import global_config
14from autotest_lib.client.common_lib import time_utils
15from autotest_lib.server import utils
16from autotest_lib.server.cros.dynamic_suite import reporting_utils
17from autotest_lib.server.lib import status_history
18from autotest_lib.utils import labellib
19
20CONFIG = global_config.global_config
21
22
23class BoardNotAvailableError(utils.TestLabException):
24    """Raised when a board is not available in the lab."""
25
26
27class NotEnoughDutsError(utils.TestLabException):
28    """Rasied when the lab doesn't have the minimum number of duts."""
29
30    def __init__(self, board, pool, num_available, num_required, hosts):
31        """Initialize instance.
32
33        Please pass arguments by keyword.
34
35        @param board: Name of board.
36        @param pool: Name of pool.
37        @param num_available: Number of available hosts.
38        @param num_required: Number of hosts required.
39        @param hosts: Sequence of Host instances for given board and pool.
40        """
41        self.board = board
42        self.pool = pool
43        self.num_available = num_available
44        self.num_required = num_required
45        self.hosts = hosts
46        self.bug_id = None
47        self.suite_name = None
48        self.build = None
49
50
51    def __repr__(self):
52        return (
53            '<{cls} at 0x{id:x} with'
54            ' board={this.board!r},'
55            ' pool={this.pool!r},'
56            ' num_available={this.num_available!r},'
57            ' num_required={this.num_required!r},'
58            ' bug_id={this.bug_id!r},'
59            ' suite_name={this.suite_name!r},'
60            ' build={this.build!r}>'
61            .format(cls=type(self).__name__, id=id(self), this=self)
62        )
63
64
65    def __str__(self):
66        msg_parts = [
67            'Not enough DUTs for board: {this.board}, pool: {this.pool};'
68            ' required: {this.num_required}, found: {this.num_available}'
69        ]
70        format_dict = {'this': self}
71        if self.bug_id is not None:
72            msg_parts.append('bug: {bug_url}')
73            format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
74        if self.suite_name is not None:
75            msg_parts.append('suite: {this.suite_name}')
76        if self.build is not None:
77            msg_parts.append('build: {this.build}')
78        return ', '.join(msg_parts).format(**format_dict)
79
80
81    def add_bug_id(self, bug_id):
82        """Add crbug id associated with this exception.
83
84        @param bug_id  crbug id whose str() value is used in a crbug URL.
85        """
86        self.bug_id = bug_id
87
88
89    def add_suite_name(self, suite_name):
90        """Add name of test suite that needed the DUTs.
91
92        @param suite_name  Name of test suite.
93        """
94        self.suite_name = suite_name
95
96
97    def add_build(self, build):
98        """Add name of build of job that needed the DUTs.
99
100        @param build  Name of build.
101        """
102        self.build = build
103
104
105class SimpleTimer(object):
106    """A simple timer used to periodically check if a deadline has passed."""
107
108    def _reset(self):
109        """Reset the deadline."""
110        if not self.interval_hours or self.interval_hours < 0:
111            logging.error('Bad interval %s', self.interval_hours)
112            self.deadline = None
113            return
114        self.deadline = datetime.now() + datetime_base.timedelta(
115                hours=self.interval_hours)
116
117
118    def __init__(self, interval_hours=0.5):
119        """Initialize a simple periodic deadline timer.
120
121        @param interval_hours: Interval of the deadline.
122        """
123        self.interval_hours = interval_hours
124        self._reset()
125
126
127    def poll(self):
128        """Poll the timer to see if we've hit the deadline.
129
130        This method resets the deadline if it has passed. If the deadline
131        hasn't been set, or the current time is less than the deadline, the
132        method returns False.
133
134        @return: True if the deadline has passed, False otherwise.
135        """
136        if not self.deadline or datetime.now() < self.deadline:
137            return False
138        self._reset()
139        return True
140
141
142class JobTimer(object):
143    """Utility class capable of measuring job timeouts.
144    """
145
146    # Format used in datetime - string conversion.
147    time_format = '%m-%d-%Y [%H:%M:%S]'
148
149    def __init__(self, job_created_time, timeout_mins):
150        """JobTimer constructor.
151
152        @param job_created_time: float representing the time a job was
153            created. Eg: time.time()
154        @param timeout_mins: float representing the timeout in minutes.
155        """
156        self.job_created_time = datetime.fromtimestamp(job_created_time)
157        self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
158        self.debug_output_timer = SimpleTimer(interval_hours=0.5)
159        self.past_halftime = False
160
161
162    @classmethod
163    def format_time(cls, datetime_obj):
164        """Get the string formatted version of the datetime object.
165
166        @param datetime_obj: A datetime.datetime object.
167            Eg: datetime.datetime.now()
168
169        @return: A formatted string containing the date/time of the
170            input datetime.
171        """
172        return datetime_obj.strftime(cls.time_format)
173
174
175    def elapsed_time(self):
176        """Get the time elapsed since this job was created.
177
178        @return: A timedelta object representing the elapsed time.
179        """
180        return datetime.now() - self.job_created_time
181
182
183    def is_suite_timeout(self):
184        """Check if the suite timed out.
185
186        @return: True if more than timeout_hours has elapsed since the suite job
187            was created.
188        """
189        if self.elapsed_time() >= self.timeout_hours:
190            logging.info('Suite timed out. Started on %s, timed out on %s',
191                         self.format_time(self.job_created_time),
192                         self.format_time(datetime.now()))
193            return True
194        return False
195
196
197    def first_past_halftime(self):
198        """Check if we just crossed half time.
199
200        This method will only return True once, the first time it is called
201        after a job's elapsed time is past half its timeout.
202
203        @return True: If this is the first call of the method after halftime.
204        """
205        if (not self.past_halftime and
206            self.elapsed_time() > self.timeout_hours/2):
207            self.past_halftime = True
208            return True
209        return False
210
211
212class RPCHelper(object):
213    """A class to help diagnose a suite run through the rpc interface.
214    """
215
216    def __init__(self, rpc_interface):
217        """Constructor for rpc helper class.
218
219        @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
220        """
221        self.rpc_interface = rpc_interface
222
223
224    def diagnose_pool(self, board, pool, time_delta_hours, limit=10):
225        """Log diagnostic information about a timeout for a board/pool.
226
227        @param board: The board for which the current suite was run.
228        @param pool: The pool against which the current suite was run.
229        @param time_delta_hours: The time from which we should log information.
230            This is a datetime.timedelta object, as stored by the JobTimer.
231        @param limit: The maximum number of jobs per host, to log.
232
233        @raises proxy.JSONRPCException: For exceptions thrown across the wire.
234        """
235        end_time = datetime.now()
236        start_time = end_time - time_delta_hours
237        labels = labellib.LabelsMapping()
238        labels['board'] = board
239        labels['pool'] = pool
240        host_histories = status_history.HostJobHistory.get_multiple_histories(
241                self.rpc_interface,
242                time_utils.to_epoch_time(start_time),
243                time_utils.to_epoch_time(end_time),
244                labels.getlabels(),
245        )
246        if not host_histories:
247            logging.error('No hosts found for board:%s in pool:%s',
248                            board, pool)
249            return
250        status_map = {
251            status_history.UNUSED: 'Unused',
252            status_history.UNKNOWN: 'No job history',
253            status_history.WORKING: 'Working',
254            status_history.BROKEN: 'Failed repair'
255        }
256        for history in host_histories:
257            count = 0
258            job_info =''
259            for job in history:
260                start_time = (
261                        time_utils.epoch_time_to_date_string(job.start_time))
262                job_info += ('%s %s started on: %s status %s\n' %
263                        (job.id, job.name, start_time, job.job_status))
264                count += 1
265                if count >= limit:
266                    break
267            host = history.host
268            logging.error('host: %s, status: %s, locked: %s '
269                          'diagnosis: %s\n'
270                          'labels: %s\nLast %s jobs within %s:\n'
271                          '%s',
272                          history.hostname, host.status, host.locked,
273                          status_map[history.last_diagnosis()[0]],
274                          host.labels, limit, time_delta_hours,
275                          job_info)
276
277
278    def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False):
279        """Check if DUT availability for a given board and pool is less than
280        minimum.
281
282        @param board: The board to check DUT availability.
283        @param pool: The pool to check DUT availability.
284        @param minimum_duts: Minimum Number of available machines required to
285                             run the suite. Default is set to 0, which means do
286                             not force the check of available machines before
287                             running the suite.
288        @param skip_duts_check: If True, skip minimum available DUTs check.
289        @raise: NotEnoughDutsError if DUT availability is lower than minimum.
290        @raise: BoardNotAvailableError if no host found for requested
291                board/pool.
292        """
293        if minimum_duts == 0:
294            return
295
296        # TODO(ayatane): Replace label prefixes with constants in
297        # autotest_lib.server.constants
298        hosts = self.rpc_interface.get_hosts(
299                invalid=False,
300                multiple_labels=('pool:%s' % pool, 'board:%s' % board))
301        if not hosts:
302            raise BoardNotAvailableError(
303                    'No hosts found for board:%s in pool:%s. The test lab '
304                    'currently does not cover test for this board and pool.'%
305                    (board, pool))
306
307        if skip_duts_check:
308            # Bypass minimum avilable DUTs check
309            logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.')
310            return
311
312        if len(hosts) < minimum_duts:
313            logging.debug('The total number of DUTs for %s in pool:%s is %d, '
314                          'which is less than %d, the required minimum number of'
315                          ' available DUTS', board, pool, len(hosts),
316                          minimum_duts)
317
318        available_hosts = 0
319        for host in hosts:
320            if host.is_available():
321                available_hosts += 1
322        logging.debug('%d of %d DUTs are available for board %s pool %s.',
323                      available_hosts, len(hosts), board, pool)
324        if available_hosts < minimum_duts:
325            raise NotEnoughDutsError(
326                board=board,
327                pool=pool,
328                num_available=available_hosts,
329                num_required=minimum_duts,
330                hosts=hosts)
331
332
333    def diagnose_job(self, job_id, instance_server):
334        """Diagnose a suite job.
335
336        Logs information about the jobs that are still to run in the suite.
337
338        @param job_id: The id of the suite job to get information about.
339            No meaningful information gets logged if the id is for a sub-job.
340        @param instance_server: The instance server.
341            Eg: cautotest, cautotest-cq, localhost.
342        """
343        incomplete_jobs = self.rpc_interface.get_jobs(
344                parent_job_id=job_id, summary=True,
345                hostqueueentry__complete=False)
346        if incomplete_jobs:
347            logging.info('\n%s printing summary of incomplete jobs (%s):\n',
348                         JobTimer.format_time(datetime.now()),
349                         len(incomplete_jobs))
350            for job in incomplete_jobs:
351                logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
352                             reporting_utils.link_job(job.id, instance_server))
353        else:
354            logging.info('All jobs in suite have already completed.')
355