• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
# This file lets us test the repair supporting code.
# We could not easily unit test it if it were in the repair file, as that file
# makes a function call that is not protected by an
# `if __name__ == '__main__'` guard.
8
9import datetime, getpass, logging, operator, smtplib, urllib2, xmlrpclib
10
11import common
12
13from autotest_lib.client.common_lib import global_config, mail, logging_config
14from autotest_lib.server import frontend
15from autotest_lib.server.cros.dynamic_suite import reporting
16
17
# Addresses used when a notification email has to be sent: the receiver,
# and the sender (which falls back to the current user name).
_NOTIFY_ADDRESS = global_config.global_config.get_config_value(
        'SCHEDULER', 'notify_email_errors', default='')
_SENDER_ADDRESS = global_config.global_config.get_config_value(
        'SCHEDULER', 'notify_email_from', default=getpass.getuser())

# Jobs that ran more than this many minutes past the maximum job timeout
# are ignored.
_CUTOFF_AFTER_TIMEOUT_MINS = 60
# Maximum job runtime in minutes taken from the config; 0 when it is unset.
_DEFAULT_TEST_TIMEOUT_MINS = global_config.global_config.get_config_value(
        'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int, default=0)
30
31
class MachineDeathLogger(logging_config.LoggingConfig):
    """
    Logging configuration for machines that enter the Repair Failed state.

    The log file is placed in the server log directory reported by the base
    LoggingConfig class, so it follows any change of the default log
    location, while the dedicated 'machine_death' logger keeps these records
    separate from the other logs.

    """
    LOGFILE_NAME = 'machine_death.log'
    file_formatter = logging.Formatter(datefmt='%m/%d %H:%M:%S',
                                       fmt='%(asctime)s | %(message)s')

    def __init__(self):
        parent = super(MachineDeathLogger, self)
        parent.__init__(False)
        # Route everything through a logger dedicated to machine deaths.
        self.logger = logging.getLogger('machine_death')

        parent.configure_logging(use_console=False)
        # Only ERROR and above end up in the machine death log file.
        self.add_file_handler(self.LOGFILE_NAME, logging.ERROR,
                              log_dir=self.get_server_log_dir())
53
54
def _find_problem_test(machine, rpc):
    """
    Look up the most recently started job on a machine.

    The search window reaches back _DEFAULT_TEST_TIMEOUT_MINS plus
    _CUTOFF_AFTER_TIMEOUT_MINS minutes from now. When global_config has no
    job_max_runtime_mins_default the window shrinks to just
    _CUTOFF_AFTER_TIMEOUT_MINS.

    @param machine: The hostname (e.g. IP address) of the machine whose last
        job is wanted.

    @param rpc: The rpc object used to contact the server.

    @return the host queue entry dictionary with the greatest 'started_on'
        value inside the window, or None if the query returned nothing.
    """
    # The django QuerySet latest() function is not reachable through the RPC
    # interface, so fetch every entry started inside the window and pick the
    # newest one locally.
    now = datetime.datetime.today()
    window = (datetime.timedelta(minutes=_DEFAULT_TEST_TIMEOUT_MINS) +
              datetime.timedelta(minutes=_CUTOFF_AFTER_TIMEOUT_MINS))
    earliest_start = now - window

    entries = rpc.run('get_host_queue_entries', host__hostname=machine,
                      started_on__gte=str(earliest_start))
    if not entries:
        return None
    return max(entries, key=operator.itemgetter('started_on'))
87
88
def flag_problem_test(machine):
    """
    Notify people about the last job that ran on a machine.

    This method is invoked every time a machine fails to repair, and attempts
    to identify the last test that ran on the machine. If successful, it
    files a bug. If no bug id comes back it sends out an email instead, and
    if that email is rejected with an SMTPDataError it just logs the machine
    and the job. Failing to reach the RPC server is logged and ends the
    attempt.

    @param machine: The hostname (e.g. IP address) of the machine to find the
        last job ran on it.

    """
    rpc = frontend.AFE()
    logger = MachineDeathLogger()

    try:
        problem_test = _find_problem_test(machine, rpc)
    except (urllib2.URLError, xmlrpclib.ProtocolError):
        # Pass the value as a logger argument so formatting is left to the
        # logging module rather than done eagerly here.
        logger.logger.error('%s | ERROR: Could not contact RPC server',
                            machine)
        return

    if not problem_test:
        # No job found inside the search window: nothing to report.
        return

    job_id = problem_test['job']['id']
    job_name = problem_test['job']['name']
    bug = reporting.MachineKillerBug(job_id=job_id,
                                     job_name=job_name,
                                     machine=machine)
    reporter = reporting.Reporter()
    bug_id = reporter.report(bug)[0]
    if bug_id is not None:
        return

    # No bug id was returned, so fall back to an email notification.
    email_prefix = ('The following test is killing a machine, '
                    'could not file a bug to report this:\n\n')
    try:
        mail.send(_SENDER_ADDRESS, _NOTIFY_ADDRESS, '',
                  bug.title(), email_prefix + bug.summary())
    except smtplib.SMTPDataError:
        # Last resort: record the machine and job in the machine death log.
        logger.logger.error('%s | %d | %s', machine, job_id, job_name)
129