• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2
3import sys, optparse, pwd
4import common
5from autotest_lib.cli import rpc, host
6from autotest_lib.client.common_lib import host_queue_entry_states
7
8parser = optparse.OptionParser(
9    usage='Usage: %prog [options] <job id> [<hostname>]\n\n'
10          'Describes why the given job on the given host has not started.')
11parser.add_option('-w', '--web',
12                  help='Autotest server to use (i.e. "autotest")')
13options, args = parser.parse_args()
14
15if len(args) < 1:
16    parser.print_help()
17    sys.exit(1)
18
19job_id = int(args[0])
20
21autotest_host = rpc.get_autotest_server(options.web)
22proxy = rpc.afe_comm(autotest_host)
23
24# job exists?
25jobs = proxy.run('get_jobs', id=job_id)
26if not jobs:
27    print 'No such job', job_id
28    sys.exit(1)
29job = jobs[0]
30owner = job['owner']
31
32RUNNING_HQE_STATUSES = host_queue_entry_states.ACTIVE_STATUSES
33
34# any entry eligible for this host?
35queue_entries = proxy.run('get_host_queue_entries', job__id=job_id)
36
37### Divine why an atomic group job is or is not running.
38if queue_entries and queue_entries[0]['atomic_group']:
39    if queue_entries[0]['status'] in RUNNING_HQE_STATUSES:
40        print 'Job %d appears to have started (status: %s).' % (
41                job_id, queue_entries[0]['status'])
42        sys.exit(0)
43    # Hosts in Repairing or Repair Failed will have Queued queue entries.
44    # We shouldn't consider those queue entries as a multi-group job.
45    repair_hostnames = []
46    for queue_entry in queue_entries:
47        if queue_entry['host'] and queue_entry['host']['status']:
48            if queue_entry['host']['status'].startswith('Repair'):
49                repair_hostnames.append(queue_entry['host']['hostname'])
50        if queue_entry['status'] in ('Completed', 'Stopped'):
51            print 'This job has already finished.'
52            sys.exit(0)
53    queue_entries_with_hosts = [queue_entry for queue_entry in queue_entries
54                                if queue_entry['host']]
55    all_queue_entries_have_hosts = (len(queue_entries) ==
56                                    len(queue_entries_with_hosts))
57    if (not all_queue_entries_have_hosts and len(queue_entries) > 1 and
58        not repair_hostnames):
59        # We test repair_hostnames so that this message is not printed when
60        # the script is run on an atomic group job which has hosts assigned
61        # but is not running because too many of them are in Repairing or will
62        # never run because hosts have exited Repairing into the Repair Failed
63        # dead end.
64        print 'This script does not support multi-group atomic group jobs.'
65        print
66        print 'Jobs scheduled in that state are typically unintentional.'
67        print
68        print 'Did you perhaps schedule the job via the web frontend and ask'
69        print 'that it run on more than 1 (atomic group) of hosts via the '
70        print '"Run on any" box?  If so, always enter 1 there when scheduling'
71        print 'jobs on anything marked "(atomic group)".'
72        print
73        print len(queue_entries), 'non-started atomic group HostQueueEntries',
74        print 'found for job', job_id
75        sys.exit(1)
76    atomic_group_name = queue_entries[0]['atomic_group']['name']
77    # Get the list of labels associated with this atomic group.
78    atomic_labels = proxy.run('get_labels',
79                              atomic_group__name=atomic_group_name)
80    if len(atomic_labels) < 1:
81        print 'Job requests atomic group %s but no labels' % atomic_group_name
82        print '(and thus no hosts) are associated with that atomic group.'
83
84    job_sync_count = job['synch_count']
85    # Ugh! This is returned as a comma separated str of label names.
86    if job.get('dependencies'):
87        job_dependency_label_names = job['dependencies'].split(',')
88    else:
89        job_dependency_label_names = []
90
91    meta_host_name = queue_entries[0]['meta_host']
92    if meta_host_name:
93        meta_host = proxy.run('get_labels', atomic_group__name=meta_host_name)[0]
94    else:
95        meta_host = None
96
97    # A mapping from label name -> a list of hostnames usable for this job.
98    runnable_atomic_label_names = {}
99
100    # A mapping from label name -> a host_exclude_reasons map as described
101    # within the loop below.  Any atomic group labels in this map are not
102    # ready to run the job for the reasons contained within.
103    atomic_label_exclude_reasons = {}
104
105    for label in atomic_labels:
106        label_name = label['name']
107        if meta_host and meta_host_name != label_name:
108            print 'Cannot run on atomic label %s due to meta_host %s.' % (
109                    label_name, meta_host_name)
110            continue
111        for dep_name in job_dependency_label_names:
112            if dep_name != label_name:
113                print 'Not checking hosts in atomic label %s against' % (
114                        label_name,)
115                print 'job dependency label %s.  There may be less hosts' % (
116                        dep_name,)
117                print 'than examined below available to run this job.'
118
119        # Get the list of hosts associated with this atomic group label.
120        atomic_hosts = proxy.run('get_hosts', multiple_labels=[label_name])
121
122        # A map of hostname -> A list of reasons it can't be used.
123        host_exclude_reasons = {}
124
125        atomic_hostnames = [h['hostname'] for h in atomic_hosts]
126
127        # Map hostnames to a list of ACL names on that host.
128        acl_groups = proxy.run('get_acl_groups',
129                               hosts__hostname__in=atomic_hostnames)
130        hostname_to_acl_name_list = {}
131        for acl in acl_groups:
132            for hostname in acl['hosts']:
133                hostname_to_acl_name_list.setdefault(hostname, []).append(
134                        acl['name'])
135
136        # Exclude any hosts that ACLs deny us access to.
137        accessible_hosts = proxy.run('get_hosts', hostname__in=atomic_hostnames,
138                                     aclgroup__users__login=owner)
139        assert len(accessible_hosts) <= len(atomic_hosts)
140        if len(accessible_hosts) != len(atomic_hosts):
141            accessible_hostnames = set(h['hostname'] for h in accessible_hosts)
142            acl_excluded_hostnames = (set(atomic_hostnames) -
143                                      accessible_hostnames)
144            for hostname in acl_excluded_hostnames:
145                acls = ','.join(hostname_to_acl_name_list[hostname])
146                host_exclude_reasons.setdefault(hostname, []).append(
147                        'User %s does not have ACL access. ACLs: %s' % (
148                                owner, acls))
149
150        # Check for locked hosts.
151        locked_hosts = [h for h in atomic_hosts if h['locked']]
152        for host in locked_hosts:
153            locker = host.get('locked_by') or 'UNKNOWN'
154            msg = 'Locked by user %s on %s.  No jobs will schedule on it.' % (
155                    locker, host.get('lock_time'))
156            host_exclude_reasons.setdefault(host['hostname'], []).append(msg)
157
158        # Exclude hosts that are not Ready.
159        for host in atomic_hosts:
160            hostname = host['hostname']
161            if host['status'] != 'Ready':
162                message = 'Status is %s' % host['status']
163                if host['status'] in ('Verifying', 'Pending', 'Running'):
164                    running_hqes = proxy.run(
165                            'get_host_queue_entries', host__hostname=hostname,
166                            status__in=RUNNING_HQE_STATUSES)
167                    if not running_hqes:
168                        message += ' (unknown job)'
169                    else:
170                        message += ' (job %d)' % running_hqes[0]['job']['id']
171                host_exclude_reasons.setdefault(hostname, []).append(message)
172
173        # If we don't have enough usable hosts, this group cannot run the job.
174        usable_hostnames = [host['hostname'] for host in atomic_hosts
175                            if host['hostname'] not in host_exclude_reasons]
176        if len(usable_hostnames) < job_sync_count:
177            message = ('%d hosts are required but only %d available.' %
178                       (job_sync_count, len(usable_hostnames)))
179            atomic_label_exclude_reasons[label_name] = (message,
180                                                        host_exclude_reasons)
181        else:
182            runnable_atomic_label_names[label_name] = usable_hostnames
183
184    for label_name, reason_tuple in atomic_label_exclude_reasons.iteritems():
185        job_reason, hosts_reasons = reason_tuple
186        print 'Atomic group "%s" via label "%s" CANNOT run job %d because:' % (
187                atomic_group_name, label_name, job_id)
188        print job_reason
189        for hostname in sorted(hosts_reasons.keys()):
190            for reason in hosts_reasons[hostname]:
191                print '%s\t%s' % (hostname, reason)
192        print
193
194    for label_name, host_list in runnable_atomic_label_names.iteritems():
195        print 'Atomic group "%s" via label "%s" is READY to run job %d on:' % (
196                atomic_group_name, label_name, job_id)
197        print ', '.join(host_list)
198        print 'Is the job scheduler healthy?'
199        print
200
201    sys.exit(0)
202
203
204### Not an atomic group synchronous job:
205
206if len(args) != 2:
207    if len(queue_entries) == 1 and queue_entries[0]['host']:
208        hostname = queue_entries[0]['host']['hostname']
209    else:
210        parser.print_help()
211        print '\nERROR: A hostname associated with the job is required.'
212        sys.exit(1)
213else:
214    hostname = args[1]
215
216# host exists?
217hosts = proxy.run('get_hosts', hostname=hostname)
218if not hosts:
219    print 'No such host', hostname
220    sys.exit(1)
221host = hosts[0]
222
223# Boolean to track our findings.  We want to list all reasons it won't run,
224# not just the first.
225job_will_run = True
226
227entries_for_this_host = [entry for entry in queue_entries
228                         if entry['host']
229                         and entry['host']['hostname'] == hostname]
230host_label_names = set(host['labels'])
231eligible_metahost_entries = [entry for entry in queue_entries
232                             if entry['meta_host'] and not entry['host']
233                             and entry['meta_host'] in host_label_names
234                             and not entry['complete']]
235
236if entries_for_this_host:
237    assert len(entries_for_this_host) == 1, (
238        'Multiple entries for this job assigned to this host!')
239    entry = entries_for_this_host[0]
240    if entry['active'] or entry['complete']:
241        print ('Job already ran or is running on this host! (status: %s)' %
242               entry['full_status'])
243        sys.exit(0)
244    is_metahost = False
245else:
246    # no entry for this host -- maybe an eligible metahost entry?
247    if not eligible_metahost_entries:
248        print ("Host isn't scheduled for this job, and no eligible metahost "
249               "entry exists")
250        sys.exit(0)
251    is_metahost = True
252
253# meets atomic group requirements?
254host_labels = proxy.run('get_labels', name__in=list(host_label_names))
255host_atomic_group_labels = [label for label in host_labels
256                            if label['atomic_group']]
257host_atomic_group_name = None
258if host_atomic_group_labels:
259    atomic_groups = set()
260    for label in host_atomic_group_labels:
261        atomic_groups.add(label['atomic_group']['name'])
262    if len(atomic_groups) != 1:
263        print 'Host has more than one atomic group!'
264        print list(atomic_groups)
265        sys.exit(1)
266    host_atomic_group_label = host_atomic_group_labels[0]
267    host_atomic_group_name = host_atomic_group_label['atomic_group']['name']
268
269job_atomic_groups = set(entry['atomic_group'] for entry in queue_entries)
270assert len(job_atomic_groups) == 1, 'Job has more than one atomic group value!'
271job_atomic_group = job_atomic_groups.pop() # might be None
272job_atomic_group_name = None
273if job_atomic_group:
274    job_atomic_group_name = job_atomic_group['name']
275
276if host_atomic_group_name != job_atomic_group_name:
277    print ('Job is for atomic group %s, but host is in atomic group %s '
278           '(label %s)' %
279           (job_atomic_group_name, host_atomic_group_name,
280            host_atomic_group_label['name']))
281    job_will_run = False
282
283# host locked?
284if host['locked']:
285    print 'Host is locked by', host['locked_by'], 'no jobs will schedule on it.'
286    job_will_run = False
287
288# acl accessible?
289accessible = proxy.run('get_hosts', hostname=hostname,
290                       aclgroup__users__login=owner)
291if not accessible:
292    host_acls = ', '.join(group['name'] for group in
293                          proxy.run('get_acl_groups', hosts__hostname=hostname))
294    owner_acls = ', '.join(group['name'] for group in
295                           proxy.run('get_acl_groups', users__login=owner))
296    print 'Host not ACL-accessible to job owner', owner
297    print ' Host ACLs:', host_acls
298    print ' Owner Acls:', owner_acls
299    job_will_run = False
300
301# meets dependencies?
302job_deps_list = job['dependencies'].split(',')
303job_deps = set()
304if job_deps_list != ['']:
305    job_deps = set(job_deps_list)
306unmet = job_deps - host_label_names
307if unmet:
308    print ("Host labels (%s) don't satisfy job dependencies: %s" %
309           (', '.join(host_label_names), ', '.join(unmet)))
310    job_will_run = False
311
312# at this point, if the job is for an unassigned atomic group, things are too
313# complicated to proceed
314unassigned_atomic_group_entries = [entry for entry in queue_entries
315                                   if entry['atomic_group']
316                                   and not entry['host']]
317if unassigned_atomic_group_entries:
318    print ("Job is for an unassigned atomic group.  That's too complicated, I "
319           "can't give you any definite answers.  Sorry.")
320    sys.exit(1)
321
322# meets only_if_needed labels?
323if is_metahost:
324    metahost_names = set(entry['meta_host']
325                         for entry in eligible_metahost_entries)
326    job_deps_and_metahosts = job_deps.union(metahost_names)
327    for label in host_labels:
328        unmet_exclusive_label = (label['only_if_needed'] and
329                                 label['name'] not in job_deps_and_metahosts)
330        if unmet_exclusive_label:
331            print ('Host contains "only if needed" label %s, unused by job '
332                   'dependencies and metahosts' % label['name'])
333            job_will_run = False
334
335# host ready?
336if host['status'] != 'Ready':
337    if host['status'] == 'Pending':
338        active = proxy.run('get_host_queue_entries',
339                           host=host['id'], active=True)
340        if not active:
341            print ('Host %s seems to be in "Pending" state incorrectly; please '
342                   'report this to the Autotest team' % hostname)
343            sys.exit(1)
344    print 'Host not in "Ready" status (status="%s")' % host['status']
345    job_will_run = False
346
347if job_will_run:
348    print ("Job %s should run on host %s; if you've already waited about ten "
349           "minutes or longer, it's probably a server issue or a bug." %
350           (job_id, hostname))
351    sys.exit(1)
352else:
353    print "All of the reasons this job is not running are listed above."
354    sys.exit(0)
355