• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7import datetime as datetime_base
8import logging
9from datetime import datetime
10
11import common
12
13from autotest_lib.client.common_lib import global_config
14from autotest_lib.client.common_lib import host_states
15from autotest_lib.client.common_lib import time_utils
16from autotest_lib.server import utils
17from autotest_lib.server.cros.dynamic_suite import reporting_utils
18from autotest_lib.server.lib import status_history
19
20CONFIG = global_config.global_config
21
22
23class BoardNotAvailableError(utils.TestLabException):
24    """Raised when a board is not available in the lab."""
25
26
27class NotEnoughDutsError(utils.TestLabException):
28    """Rasied when the lab doesn't have the minimum number of duts."""
29
30    def __init__(self, board, pool, num_available, num_required, hosts):
31        """Initialize instance.
32
33        Please pass arguments by keyword.
34
35        @param board: Name of board.
36        @param pool: Name of pool.
37        @param num_available: Number of available hosts.
38        @param num_required: Number of hosts required.
39        @param hosts: Sequence of Host instances for given board and pool.
40        """
41        self.board = board
42        self.pool = pool
43        self.num_available = num_available
44        self.num_required = num_required
45        self.hosts = hosts
46        self.bug_id = None
47        self.suite_name = None
48        self.build = None
49
50
51    def __repr__(self):
52        return (
53            '<{cls} at 0x{id:x} with'
54            ' board={this.board!r},'
55            ' pool={this.pool!r},'
56            ' num_available={this.num_available!r},'
57            ' num_required={this.num_required!r},'
58            ' bug_id={this.bug_id!r},'
59            ' suite_name={this.suite_name!r},'
60            ' build={this.build!r}>'
61            .format(cls=type(self).__name__, id=id(self), this=self)
62        )
63
64
65    def __str__(self):
66        msg_parts = [
67            'Not enough DUTs for board: {this.board}, pool: {this.pool};'
68            ' required: {this.num_required}, found: {this.num_available}'
69        ]
70        format_dict = {'this': self}
71        if self.bug_id is not None:
72            msg_parts.append('bug: {bug_url}')
73            format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
74        if self.suite_name is not None:
75            msg_parts.append('suite: {this.suite_name}')
76        if self.build is not None:
77            msg_parts.append('build: {this.build}')
78        return ', '.join(msg_parts).format(**format_dict)
79
80
81    def add_bug_id(self, bug_id):
82        """Add crbug id associated with this exception.
83
84        @param bug_id  crbug id whose str() value is used in a crbug URL.
85        """
86        self.bug_id = bug_id
87
88
89    def add_suite_name(self, suite_name):
90        """Add name of test suite that needed the DUTs.
91
92        @param suite_name  Name of test suite.
93        """
94        self.suite_name = suite_name
95
96
97    def add_build(self, build):
98        """Add name of build of job that needed the DUTs.
99
100        @param build  Name of build.
101        """
102        self.build = build
103
104
105class SimpleTimer(object):
106    """A simple timer used to periodically check if a deadline has passed."""
107
108    def _reset(self):
109        """Reset the deadline."""
110        if not self.interval_hours or self.interval_hours < 0:
111            logging.error('Bad interval %s', self.interval_hours)
112            self.deadline = None
113            return
114        self.deadline = datetime.now() + datetime_base.timedelta(
115                hours=self.interval_hours)
116
117
118    def __init__(self, interval_hours=0.5):
119        """Initialize a simple periodic deadline timer.
120
121        @param interval_hours: Interval of the deadline.
122        """
123        self.interval_hours = interval_hours
124        self._reset()
125
126
127    def poll(self):
128        """Poll the timer to see if we've hit the deadline.
129
130        This method resets the deadline if it has passed. If the deadline
131        hasn't been set, or the current time is less than the deadline, the
132        method returns False.
133
134        @return: True if the deadline has passed, False otherwise.
135        """
136        if not self.deadline or datetime.now() < self.deadline:
137            return False
138        self._reset()
139        return True
140
141
142class JobTimer(object):
143    """Utility class capable of measuring job timeouts.
144    """
145
146    # Format used in datetime - string conversion.
147    time_format = '%m-%d-%Y [%H:%M:%S]'
148
149    def __init__(self, job_created_time, timeout_mins):
150        """JobTimer constructor.
151
152        @param job_created_time: float representing the time a job was
153            created. Eg: time.time()
154        @param timeout_mins: float representing the timeout in minutes.
155        """
156        self.job_created_time = datetime.fromtimestamp(job_created_time)
157        self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
158        self.debug_output_timer = SimpleTimer(interval_hours=0.5)
159        self.past_halftime = False
160
161
162    @classmethod
163    def format_time(cls, datetime_obj):
164        """Get the string formatted version of the datetime object.
165
166        @param datetime_obj: A datetime.datetime object.
167            Eg: datetime.datetime.now()
168
169        @return: A formatted string containing the date/time of the
170            input datetime.
171        """
172        return datetime_obj.strftime(cls.time_format)
173
174
175    def elapsed_time(self):
176        """Get the time elapsed since this job was created.
177
178        @return: A timedelta object representing the elapsed time.
179        """
180        return datetime.now() - self.job_created_time
181
182
183    def is_suite_timeout(self):
184        """Check if the suite timed out.
185
186        @return: True if more than timeout_hours has elapsed since the suite job
187            was created.
188        """
189        if self.elapsed_time() >= self.timeout_hours:
190            logging.info('Suite timed out. Started on %s, timed out on %s',
191                         self.format_time(self.job_created_time),
192                         self.format_time(datetime.now()))
193            return True
194        return False
195
196
197    def first_past_halftime(self):
198        """Check if we just crossed half time.
199
200        This method will only return True once, the first time it is called
201        after a job's elapsed time is past half its timeout.
202
203        @return True: If this is the first call of the method after halftime.
204        """
205        if (not self.past_halftime and
206            self.elapsed_time() > self.timeout_hours/2):
207            self.past_halftime = True
208            return True
209        return False
210
211
212class RPCHelper(object):
213    """A class to help diagnose a suite run through the rpc interface.
214    """
215
216    def __init__(self, rpc_interface):
217        """Constructor for rpc helper class.
218
219        @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
220        """
221        self.rpc_interface = rpc_interface
222
223
224    def diagnose_pool(self, board, pool, time_delta_hours, limit=10):
225        """Log diagnostic information about a timeout for a board/pool.
226
227        @param board: The board for which the current suite was run.
228        @param pool: The pool against which the current suite was run.
229        @param time_delta_hours: The time from which we should log information.
230            This is a datetime.timedelta object, as stored by the JobTimer.
231        @param limit: The maximum number of jobs per host, to log.
232
233        @raises proxy.JSONRPCException: For exceptions thrown across the wire.
234        """
235        end_time = datetime.now()
236        start_time = end_time - time_delta_hours
237        get_histories = status_history.HostJobHistory.get_multiple_histories
238        host_histories = get_histories(
239                self.rpc_interface,
240                time_utils.to_epoch_time(start_time),
241                time_utils.to_epoch_time(end_time),
242                board=board, pool=pool)
243        if not host_histories:
244            logging.error('No hosts found for board:%s in pool:%s',
245                            board, pool)
246            return
247        status_map = {
248            status_history.UNUSED: 'Unused',
249            status_history.UNKNOWN: 'No job history',
250            status_history.WORKING: 'Working',
251            status_history.BROKEN: 'Failed repair'
252        }
253        for history in host_histories:
254            count = 0
255            job_info =''
256            for job in history:
257                start_time = (
258                        time_utils.epoch_time_to_date_string(job.start_time))
259                job_info += ('%s %s started on: %s status %s\n' %
260                        (job.id, job.name, start_time, job.job_status))
261                count += 1
262                if count >= limit:
263                    break
264            host = history.host
265            logging.error('host: %s, status: %s, locked: %s '
266                          'diagnosis: %s\n'
267                          'labels: %s\nLast %s jobs within %s:\n'
268                          '%s',
269                          history.hostname, host.status, host.locked,
270                          status_map[history.last_diagnosis()[0]],
271                          host.labels, limit, time_delta_hours,
272                          job_info)
273
274
275    def _is_host_available(self, host):
276        """Check whether DUT host is available.
277
278        @param host: The Host instance for the DUT.
279        @return: bool
280        """
281        return not (host.locked or host.status in host_states.UNAVAILABLE_STATES)
282
283
284    def check_dut_availability(self, board, pool, minimum_duts=0, skip_duts_check=False):
285        """Check if DUT availability for a given board and pool is less than
286        minimum.
287
288        @param board: The board to check DUT availability.
289        @param pool: The pool to check DUT availability.
290        @param minimum_duts: Minimum Number of available machines required to
291                             run the suite. Default is set to 0, which means do
292                             not force the check of available machines before
293                             running the suite.
294        @param skip_duts_check: If True, skip minimum available DUTs check.
295        @raise: NotEnoughDutsError if DUT availability is lower than minimum.
296        @raise: BoardNotAvailableError if no host found for requested
297                board/pool.
298        """
299        if minimum_duts == 0:
300            return
301
302        # TODO(ayatane): Replace label prefixes with constants in
303        # site_utils.suite_scheduler.constants
304        hosts = self.rpc_interface.get_hosts(
305                invalid=False,
306                multiple_labels=('pool:%s' % pool, 'board:%s' % board))
307        if not hosts:
308            raise BoardNotAvailableError(
309                    'No hosts found for board:%s in pool:%s. The test lab '
310                    'currently does not cover test for this board and pool.'%
311                    (board, pool))
312
313        if skip_duts_check:
314            # Bypass minimum avilable DUTs check
315            logging.debug('skip_duts_check is on, do not enforce minimum DUTs check.')
316            return
317
318        if len(hosts) < minimum_duts:
319            logging.debug('The total number of DUTs for %s in pool:%s is %d, '
320                          'which is less than %d, the required minimum number of'
321                          ' available DUTS', board, pool, len(hosts),
322                          minimum_duts)
323
324        available_hosts = 0
325        for host in hosts:
326            if self._is_host_available(host):
327                available_hosts += 1
328        logging.debug('%d of %d DUTs are available for board %s pool %s.',
329                      available_hosts, len(hosts), board, pool)
330        if available_hosts < minimum_duts:
331            raise NotEnoughDutsError(
332                board=board,
333                pool=pool,
334                num_available=available_hosts,
335                num_required=minimum_duts,
336                hosts=hosts)
337
338
339    def diagnose_job(self, job_id, instance_server):
340        """Diagnose a suite job.
341
342        Logs information about the jobs that are still to run in the suite.
343
344        @param job_id: The id of the suite job to get information about.
345            No meaningful information gets logged if the id is for a sub-job.
346        @param instance_server: The instance server.
347            Eg: cautotest, cautotest-cq, localhost.
348        """
349        incomplete_jobs = self.rpc_interface.get_jobs(
350                parent_job_id=job_id, summary=True,
351                hostqueueentry__complete=False)
352        if incomplete_jobs:
353            logging.info('\n%s printing summary of incomplete jobs (%s):\n',
354                         JobTimer.format_time(datetime.now()),
355                         len(incomplete_jobs))
356            for job in incomplete_jobs:
357                logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
358                             reporting_utils.link_job(job.id, instance_server))
359        else:
360            logging.info('All jobs in suite have already completed.')
361