cli/contrib/why_isnt_my_job_running.py

#!/usr/bin/python

import sys, optparse, pwd
import common
from autotest_lib.cli import rpc, host
from autotest_lib.client.common_lib import host_queue_entry_states

parser = optparse.OptionParser(
    usage='Usage: %prog [options] <job id> [<hostname>]\n\n'
          'Describes why the given job on the given host has not started.')
parser.add_option('-w', '--web',
                  help='Autotest server to use (i.e. "autotest")')
options, args = parser.parse_args()

if len(args) < 1:
    parser.print_help()
    sys.exit(1)

job_id = int(args[0])

autotest_host = rpc.get_autotest_server(options.web)
proxy = rpc.afe_comm(autotest_host)

# job exists?
jobs = proxy.run('get_jobs', id=job_id)
if not jobs:
    print 'No such job', job_id
    sys.exit(1)
job = jobs[0]
owner = job['owner']

RUNNING_HQE_STATUSES = host_queue_entry_states.ACTIVE_STATUSES

# any entry eligible for this host?
queue_entries = proxy.run('get_host_queue_entries', job__id=job_id)

### Divine why an atomic group job is or is not running.
if queue_entries and queue_entries[0]['atomic_group']:
    if queue_entries[0]['status'] in RUNNING_HQE_STATUSES:
        print 'Job %d appears to have started (status: %s).' % (
                job_id, queue_entries[0]['status'])
        sys.exit(0)
    # Hosts in Repairing or Repair Failed will have Queued queue entries.
    # We shouldn't consider those queue entries as a multi-group job.
    repair_hostnames = []
    for queue_entry in queue_entries:
        if queue_entry['host'] and queue_entry['host']['status']:
            if queue_entry['host']['status'].startswith('Repair'):
                repair_hostnames.append(queue_entry['host']['hostname'])
        if queue_entry['status'] in ('Completed', 'Stopped'):
            print 'This job has already finished.'
            sys.exit(0)
    queue_entries_with_hosts = [queue_entry for queue_entry in queue_entries
                                if queue_entry['host']]
    all_queue_entries_have_hosts = (len(queue_entries) ==
                                    len(queue_entries_with_hosts))
    if (not all_queue_entries_have_hosts and len(queue_entries) > 1 and
        not repair_hostnames):
        # We test repair_hostnames so that this message is not printed when
        # the script is run on an atomic group job which has hosts assigned
        # but is not running because too many of them are in Repairing or will
        # never run because hosts have exited Repairing into the Repair Failed
        # dead end.
        print 'This script does not support multi-group atomic group jobs.'
        print
        print 'Jobs scheduled in that state are typically unintentional.'
        print
        print 'Did you perhaps schedule the job via the web frontend and ask'
        print 'that it run on more than 1 (atomic group) of hosts via the '
        print '"Run on any" box?  If so, always enter 1 there when scheduling'
        print 'jobs on anything marked "(atomic group)".'
        print
        print len(queue_entries), 'non-started atomic group HostQueueEntries',
        print 'found for job', job_id
        sys.exit(1)
    atomic_group_name = queue_entries[0]['atomic_group']['name']
    # Get the list of labels associated with this atomic group.
    atomic_labels = proxy.run('get_labels',
                              atomic_group__name=atomic_group_name)
    if len(atomic_labels) < 1:
        print 'Job requests atomic group %s but no labels' % atomic_group_name
        print '(and thus no hosts) are associated with that atomic group.'

    job_sync_count = job['synch_count']
    # Ugh! This is returned as a comma separated str of label names.
    if job.get('dependencies'):
        job_dependency_label_names = job['dependencies'].split(',')
    else:
        job_dependency_label_names = []

    meta_host_name = queue_entries[0]['meta_host']
    if meta_host_name:
        meta_host = proxy.run('get_labels', atomic_group__name=meta_host_name)[0]
    else:
        meta_host = None

    # A mapping from label name -> a list of hostnames usable for this job.
    runnable_atomic_label_names = {}

    # A mapping from label name -> a host_exclude_reasons map as described
    # within the loop below.  Any atomic group labels in this map are not
    # ready to run the job for the reasons contained within.
    atomic_label_exclude_reasons = {}

    for label in atomic_labels:
        label_name = label['name']
        if meta_host and meta_host_name != label_name:
            print 'Cannot run on atomic label %s due to meta_host %s.' % (
                    label_name, meta_host_name)
            continue
        for dep_name in job_dependency_label_names:
            if dep_name != label_name:
                print 'Not checking hosts in atomic label %s against' % (
                        label_name,)
                print 'job dependency label %s.  There may be less hosts' % (
                        dep_name,)
                print 'than examined below available to run this job.'

        # Get the list of hosts associated with this atomic group label.
        atomic_hosts = proxy.run('get_hosts', multiple_labels=[label_name])

        # A map of hostname -> A list of reasons it can't be used.
        host_exclude_reasons = {}

        atomic_hostnames = [h['hostname'] for h in atomic_hosts]

        # Map hostnames to a list of ACL names on that host.
        acl_groups = proxy.run('get_acl_groups',
                               hosts__hostname__in=atomic_hostnames)
        hostname_to_acl_name_list = {}
        for acl in acl_groups:
            for hostname in acl['hosts']:
                hostname_to_acl_name_list.setdefault(hostname, []).append(
                        acl['name'])

        # Exclude any hosts that ACLs deny us access to.
        accessible_hosts = proxy.run('get_hosts', hostname__in=atomic_hostnames,
                                     aclgroup__users__login=owner)
        assert len(accessible_hosts) <= len(atomic_hosts)
        if len(accessible_hosts) != len(atomic_hosts):
            accessible_hostnames = set(h['hostname'] for h in accessible_hosts)
            acl_excluded_hostnames = (set(atomic_hostnames) -
                                      accessible_hostnames)
            for hostname in acl_excluded_hostnames:
                acls = ','.join(hostname_to_acl_name_list[hostname])
                host_exclude_reasons.setdefault(hostname, []).append(
                        'User %s does not have ACL access. ACLs: %s' % (
                                owner, acls))

        # Check for locked hosts.
        locked_hosts = [h for h in atomic_hosts if h['locked']]
        for host in locked_hosts:
            locker = host.get('locked_by') or 'UNKNOWN'
            msg = 'Locked by user %s on %s.  No jobs will schedule on it.' % (
                    locker, host.get('lock_time'))
            host_exclude_reasons.setdefault(host['hostname'], []).append(msg)

        # Exclude hosts that are not Ready.
        for host in atomic_hosts:
            hostname = host['hostname']
            if host['status'] != 'Ready':
                message = 'Status is %s' % host['status']
                if host['status'] in ('Verifying', 'Pending', 'Running'):
                    running_hqes = proxy.run(
                            'get_host_queue_entries', host__hostname=hostname,
                            status__in=RUNNING_HQE_STATUSES)
                    if not running_hqes:
                        message += ' (unknown job)'
                    else:
                        message += ' (job %d)' % running_hqes[0]['job']['id']
                host_exclude_reasons.setdefault(hostname, []).append(message)

        # If we don't have enough usable hosts, this group cannot run the job.
        usable_hostnames = [host['hostname'] for host in atomic_hosts
                            if host['hostname'] not in host_exclude_reasons]
        if len(usable_hostnames) < job_sync_count:
            message = ('%d hosts are required but only %d available.' %
                       (job_sync_count, len(usable_hostnames)))
            atomic_label_exclude_reasons[label_name] = (message,
                                                        host_exclude_reasons)
        else:
            runnable_atomic_label_names[label_name] = usable_hostnames

    for label_name, reason_tuple in atomic_label_exclude_reasons.iteritems():
        job_reason, hosts_reasons = reason_tuple
        print 'Atomic group "%s" via label "%s" CANNOT run job %d because:' % (
                atomic_group_name, label_name, job_id)
        print job_reason
        for hostname in sorted(hosts_reasons.keys()):
            for reason in hosts_reasons[hostname]:
                print '%s\t%s' % (hostname, reason)
        print

    for label_name, host_list in runnable_atomic_label_names.iteritems():
        print 'Atomic group "%s" via label "%s" is READY to run job %d on:' % (
                atomic_group_name, label_name, job_id)
        print ', '.join(host_list)
        print 'Is the job scheduler healthy?'
        print

    sys.exit(0)


### Not an atomic group synchronous job:

if len(args) != 2:
    if len(queue_entries) == 1 and queue_entries[0]['host']:
        hostname = queue_entries[0]['host']['hostname']
    else:
        parser.print_help()
        print '\nERROR: A hostname associated with the job is required.'
        sys.exit(1)
else:
    hostname = args[1]

# host exists?
hosts = proxy.run('get_hosts', hostname=hostname)
if not hosts:
    print 'No such host', hostname
    sys.exit(1)
host = hosts[0]

# Boolean to track our findings.  We want to list all reasons it won't run,
# not just the first.
job_will_run = True

entries_for_this_host = [entry for entry in queue_entries
                         if entry['host']
                         and entry['host']['hostname'] == hostname]
host_label_names = set(host['labels'])
eligible_metahost_entries = [entry for entry in queue_entries
                             if entry['meta_host'] and not entry['host']
                             and entry['meta_host'] in host_label_names
                             and not entry['complete']]

if entries_for_this_host:
    assert len(entries_for_this_host) == 1, (
        'Multiple entries for this job assigned to this host!')
    entry = entries_for_this_host[0]
    if entry['active'] or entry['complete']:
        print ('Job already ran or is running on this host! (status: %s)' %
               entry['full_status'])
        sys.exit(0)
    is_metahost = False
else:
    # no entry for this host -- maybe an eligible metahost entry?
    if not eligible_metahost_entries:
        print ("Host isn't scheduled for this job, and no eligible metahost "
               "entry exists")
        sys.exit(0)
    is_metahost = True

# meets atomic group requirements?
host_labels = proxy.run('get_labels', name__in=list(host_label_names))
host_atomic_group_labels = [label for label in host_labels
                            if label['atomic_group']]
host_atomic_group_name = None
if host_atomic_group_labels:
    atomic_groups = set()
    for label in host_atomic_group_labels:
        atomic_groups.add(label['atomic_group']['name'])
    if len(atomic_groups) != 1:
        print 'Host has more than one atomic group!'
        print list(atomic_groups)
        sys.exit(1)
    host_atomic_group_label = host_atomic_group_labels[0]
    host_atomic_group_name = host_atomic_group_label['atomic_group']['name']

job_atomic_groups = set(entry['atomic_group'] for entry in queue_entries)
assert len(job_atomic_groups) == 1, 'Job has more than one atomic group value!'
job_atomic_group = job_atomic_groups.pop() # might be None
job_atomic_group_name = None
if job_atomic_group:
    job_atomic_group_name = job_atomic_group['name']

if host_atomic_group_name != job_atomic_group_name:
    print ('Job is for atomic group %s, but host is in atomic group %s '
           '(label %s)' %
           (job_atomic_group_name, host_atomic_group_name,
            host_atomic_group_label['name']))
    job_will_run = False

# host locked?
if host['locked']:
    print 'Host is locked by', host['locked_by'], 'no jobs will schedule on it.'
    job_will_run = False

# acl accessible?
accessible = proxy.run('get_hosts', hostname=hostname,
                       aclgroup__users__login=owner)
if not accessible:
    host_acls = ', '.join(group['name'] for group in
                          proxy.run('get_acl_groups', hosts__hostname=hostname))
    owner_acls = ', '.join(group['name'] for group in
                           proxy.run('get_acl_groups', users__login=owner))
    print 'Host not ACL-accessible to job owner', owner
    print ' Host ACLs:', host_acls
    print ' Owner Acls:', owner_acls
    job_will_run = False

# meets dependencies?
job_deps_list = job['dependencies'].split(',')
job_deps = set()
if job_deps_list != ['']:
    job_deps = set(job_deps_list)
unmet = job_deps - host_label_names
if unmet:
    print ("Host labels (%s) don't satisfy job dependencies: %s" %
           (', '.join(host_label_names), ', '.join(unmet)))
    job_will_run = False

# at this point, if the job is for an unassigned atomic group, things are too
# complicated to proceed
unassigned_atomic_group_entries = [entry for entry in queue_entries
                                   if entry['atomic_group']
                                   and not entry['host']]
if unassigned_atomic_group_entries:
    print ("Job is for an unassigned atomic group.  That's too complicated, I "
           "can't give you any definite answers.  Sorry.")
    sys.exit(1)

# meets only_if_needed labels?
if is_metahost:
    metahost_names = set(entry['meta_host']
                         for entry in eligible_metahost_entries)
    job_deps_and_metahosts = job_deps.union(metahost_names)
    for label in host_labels:
        unmet_exclusive_label = (label['only_if_needed'] and
                                 label['name'] not in job_deps_and_metahosts)
        if unmet_exclusive_label:
            print ('Host contains "only if needed" label %s, unused by job '
                   'dependencies and metahosts' % label['name'])
            job_will_run = False

# host ready?
if host['status'] != 'Ready':
    if host['status'] == 'Pending':
        active = proxy.run('get_host_queue_entries',
                           host=host['id'], active=True)
        if not active:
            print ('Host %s seems to be in "Pending" state incorrectly; please '
                   'report this to the Autotest team' % hostname)
            sys.exit(1)
    print 'Host not in "Ready" status (status="%s")' % host['status']
    job_will_run = False

if job_will_run:
    print ("Job %s should run on host %s; if you've already waited about ten "
           "minutes or longer, it's probably a server issue or a bug." %
           (job_id, hostname))
    sys.exit(1)
else:
    print "All of the reasons this job is not running are listed above."
    sys.exit(0)