• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7import datetime as datetime_base
8import logging
9from datetime import datetime
10
11import common
12
13from autotest_lib.client.common_lib import global_config
14from autotest_lib.client.common_lib import time_utils
15from autotest_lib.server import utils
16from autotest_lib.server.cros.dynamic_suite import reporting_utils
17from autotest_lib.server.lib import status_history
18
19CONFIG = global_config.global_config
20
21
class DUTsNotAvailableError(utils.TestLabException):
    """Error for a DUT label combination that the lab cannot provide."""
24
25
class NotEnoughDutsError(utils.TestLabException):
    """Raised when the lab doesn't have the minimum number of DUTs.

    The counts are recorded at construction time.  A bug id, suite name
    and build can be attached afterwards through the add_* methods; str()
    mentions each of those only once it has been set, while repr() always
    lists them.
    """

    def __init__(self, labels, num_available, num_required, hosts):
        """Record the details of the shortfall.

        Callers are asked to pass every argument by keyword.

        @param labels: Labels required, including board and pool labels.
        @param num_available: Number of available hosts.
        @param num_required: Number of hosts required.
        @param hosts: Sequence of Host instances for given board and pool.
        """
        self.labels = labels
        self.num_available = num_available
        self.num_required = num_required
        self.hosts = hosts
        # Optional annotations; they stay None until an add_* method is used.
        self.bug_id = self.suite_name = self.build = None


    def __repr__(self):
        """Return a debugging representation (hosts are not included)."""
        template = (
            '<{cls} at 0x{id:x} with'
            ' labels={this.labels!r},'
            ' num_available={this.num_available!r},'
            ' num_required={this.num_required!r},'
            ' bug_id={this.bug_id!r},'
            ' suite_name={this.suite_name!r},'
            ' build={this.build!r}>'
        )
        return template.format(this=self, id=id(self),
                               cls=type(self).__name__)


    def __str__(self):
        """Return the human readable message for this error.

        The message always states the labels and the required/found counts;
        bug, suite and build details follow, comma separated, for whichever
        of them have been set.
        """
        summary = (
            'Not enough DUTs for requirements: {this.labels};'
            ' required: {this.num_required}, found: {this.num_available}'
        ).format(this=self)
        details = [summary]
        if self.bug_id is not None:
            bug_url = reporting_utils.link_crbug(self.bug_id)
            details.append('bug: {bug_url}'.format(bug_url=bug_url))
        if self.suite_name is not None:
            details.append('suite: {this.suite_name}'.format(this=self))
        if self.build is not None:
            details.append('build: {this.build}'.format(this=self))
        return ', '.join(details)


    def add_bug_id(self, bug_id):
        """Attach the crbug id associated with this exception.

        @param bug_id  crbug id; it is handed to reporting_utils.link_crbug
                       when the message is rendered.
        """
        self.bug_id = bug_id


    def add_suite_name(self, suite_name):
        """Attach the name of the test suite that needed the DUTs.

        @param suite_name  Name of test suite.
        """
        self.suite_name = suite_name


    def add_build(self, build):
        """Attach the name of the build of the job that needed the DUTs.

        @param build  Name of build.
        """
        self.build = build
99
100
class SimpleTimer(object):
    """Periodic deadline tracker that callers poll to learn when it expires."""

    def __init__(self, interval_hours=0.5):
        """Create the timer and arm its first deadline.

        @param interval_hours: Length of each period, in hours.
        """
        self.interval_hours = interval_hours
        self._reset()


    def _reset(self):
        """Arm the next deadline, one interval from the current time.

        A falsy (None, 0) or negative interval is logged as an error and
        leaves the timer with no deadline at all.
        """
        hours = self.interval_hours
        if hours and not hours < 0:
            self.deadline = datetime.now() + datetime_base.timedelta(
                    hours=hours)
            return
        logging.error('Bad interval %s', hours)
        self.deadline = None


    def poll(self):
        """Report whether the current deadline has been reached.

        When the deadline has passed it is re-armed before returning, so
        the next True is only reported after another full interval.  A
        timer without a deadline never reports True.

        @return: True if the deadline has passed, False otherwise.
        """
        if self.deadline and not datetime.now() < self.deadline:
            self._reset()
            return True
        return False
136
137
class JobTimer(object):
    """Tracks how long a job has been running relative to its timeout."""

    # strftime pattern used when turning datetimes into strings.
    time_format = '%m-%d-%Y [%H:%M:%S]'

    def __init__(self, job_created_time, timeout_mins):
        """Set up the timer for a job.

        @param job_created_time: float representing the time a job was
            created. Eg: time.time()
        @param timeout_mins: float representing the timeout in minutes.
        """
        self.job_created_time = datetime.fromtimestamp(job_created_time)
        # Despite the attribute name this is a datetime.timedelta holding
        # the whole timeout, not a number of hours.
        self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
        self.debug_output_timer = SimpleTimer(interval_hours=0.5)
        self.past_halftime = False


    @classmethod
    def format_time(cls, datetime_obj):
        """Render a datetime using the class wide time_format.

        @param datetime_obj: A datetime.datetime object.
            Eg: datetime.datetime.now()

        @return: A formatted string containing the date/time of the
            input datetime.
        """
        pattern = cls.time_format
        return datetime_obj.strftime(pattern)


    def elapsed_time(self):
        """Compute how long ago this job was created.

        @return: A timedelta object representing the elapsed time.
        """
        now = datetime.now()
        return now - self.job_created_time


    def is_suite_timeout(self):
        """Check if the suite timed out.

        A timeout is also reported through logging.info, together with the
        job creation time and the time of the check.

        @return: True if at least the full timeout has elapsed since the
            suite job was created, False otherwise.
        """
        if self.elapsed_time() < self.timeout_hours:
            return False
        logging.info('Suite timed out. Started on %s, timed out on %s',
                     self.format_time(self.job_created_time),
                     self.format_time(datetime.now()))
        return True


    def first_past_halftime(self):
        """Check if we just crossed half time.

        Returns True exactly once: on the first call made after the
        elapsed time exceeds half of the timeout.  Every other call
        returns False.

        @return True: If this is the first call of the method after halftime.
        """
        if self.past_halftime:
            return False
        if self.elapsed_time() > self.timeout_hours/2:
            self.past_halftime = True
            return True
        return False
206
207
class RPCHelper(object):
    """Diagnoses a suite run by querying the lab through an rpc interface."""

    def __init__(self, rpc_interface):
        """Remember the rpc object used for every query.

        @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
        """
        self.rpc_interface = rpc_interface


    def diagnose_pool(self, labels, time_delta_hours, limit=10):
        """Log diagnostic information about a timeout for a board/pool.

        One error message is logged per host, with the host's status,
        lock state, last diagnosis, labels and a listing of its jobs in
        the time window ending now.

        @param labels: DUT label dependencies, including board and pool
                       labels.
        @param time_delta_hours: How far back from now to look.
            This is a datetime.timedelta object, as stored by the JobTimer.
        @param limit: The maximum number of jobs per host, to log.

        @raises proxy.JSONRPCException: For exceptions thrown across the wire.
        """
        window_end = datetime.now()
        window_start = window_end - time_delta_hours
        host_histories = status_history.HostJobHistory.get_multiple_histories(
                self.rpc_interface,
                time_utils.to_epoch_time(window_start),
                time_utils.to_epoch_time(window_end),
                labels,
        )
        if not host_histories:
            logging.error('No hosts found for labels %r', labels)
            return
        # Display names for the diagnosis codes; a code missing from this
        # table makes the lookup below raise KeyError.
        diagnosis_names = {
            status_history.UNUSED: 'Unused',
            status_history.UNKNOWN: 'No job history',
            status_history.WORKING: 'Working',
            status_history.BROKEN: 'Failed repair'
        }
        for history in host_histories:
            job_lines = []
            # The limit is checked after a job has been recorded, so a host
            # with any history gets at least one job listed.
            for seen, job in enumerate(history, 1):
                started_on = time_utils.epoch_time_to_date_string(
                        job.start_time)
                job_lines.append('%s %s started on: %s status %s\n' %
                        (job.id, job.name, started_on, job.job_status))
                if seen >= limit:
                    break
            host = history.host
            logging.error('host: %s, status: %s, locked: %s '
                          'diagnosis: %s\n'
                          'labels: %s\nLast %s jobs within %s:\n'
                          '%s',
                          history.hostname, host.status, host.locked,
                          diagnosis_names[history.last_diagnosis()[0]],
                          host.labels, limit, time_delta_hours,
                          ''.join(job_lines))


    def check_dut_availability(self, labels, minimum_duts=0,
                               skip_duts_check=False):
        """Check if DUT availability for a given board and pool is less than
        minimum.

        Nothing is queried when minimum_duts is 0.  Otherwise the hosts
        are fetched first, so a label combination without any host raises
        even when skip_duts_check is set.

        @param labels: DUT label dependencies, including board and pool
                       labels.
        @param minimum_duts: Minimum Number of available machines required to
                             run the suite. Default is set to 0, which means do
                             not force the check of available machines before
                             running the suite.
        @param skip_duts_check: If True, skip minimum available DUTs check.
        @raise: NotEnoughDutsError if DUT availability is lower than minimum.
        @raise: DUTsNotAvailableError if no host found for requested
                board/pool.
        """
        if minimum_duts == 0:
            return

        hosts = self.rpc_interface.get_hosts(
                invalid=False, multiple_labels=labels)
        if not hosts:
            raise DUTsNotAvailableError(
                    'No hosts found for labels %r. The test lab '
                    'currently does not cover test for those DUTs.' %
                    (labels,))

        if skip_duts_check:
            # The caller asked not to enforce the minimum.
            logging.debug('skip_duts_check is on, do not enforce minimum '
                          'DUTs check.')
            return

        total_hosts = len(hosts)
        if total_hosts < minimum_duts:
            logging.debug('The total number of DUTs for %r is %d, '
                          'which is less than %d, the required minimum '
                          'number of available DUTS', labels, total_hosts,
                          minimum_duts)

        available_hosts = sum(1 for host in hosts if host.is_available())
        logging.debug('%d of %d DUTs are available for %r.',
                      available_hosts, total_hosts, labels)
        if available_hosts >= minimum_duts:
            return
        raise NotEnoughDutsError(
            labels=labels,
            num_available=available_hosts,
            num_required=minimum_duts,
            hosts=hosts)


    def diagnose_job(self, job_id, instance_server):
        """Diagnose a suite job.

        Logs the jobs of the suite that have not completed yet, one line
        per job with a link to it, or a single message when there are none.

        @param job_id: The id of the suite job to get information about.
            No meaningful information gets logged if the id is for a sub-job.
        @param instance_server: The instance server.
            Eg: cautotest, cautotest-cq, localhost.
        """
        pending_jobs = self.rpc_interface.get_jobs(
                parent_job_id=job_id, summary=True,
                hostqueueentry__complete=False)
        if not pending_jobs:
            logging.info('All jobs in suite have already completed.')
            return

        logging.info('\n%s printing summary of incomplete jobs (%s):\n',
                     JobTimer.format_time(datetime.now()),
                     len(pending_jobs))
        for job in pending_jobs:
            # Only the part of the test name after the last '/' is shown.
            short_name = job.testname.rpartition('/')[2]
            logging.info('%s: %s', short_name,
                         reporting_utils.link_job(job.id, instance_server))
345