#!/usr/bin/python
"""Describe why a given Autotest job has not started running.

Usage: <script> [options] <job id> [<hostname>]

The script talks to the AFE RPC interface.  For a job whose first host queue
entry carries an atomic group it reports, per atomic group label, which hosts
are unusable and why, then exits.  For any other job it resolves the host to
examine (from the command line, or from the job's only queue entry) so the
per-host checks further down the file can run.
"""

import sys, optparse, pwd
import common
from autotest_lib.cli import rpc, host
from autotest_lib.client.common_lib import host_queue_entry_states

parser = optparse.OptionParser(
    usage='Usage: %prog [options] <job id> [<hostname>]\n\n'
    'Describes why the given job on the given host has not started.')
parser.add_option('-w', '--web',
                  help='Autotest server to use (i.e. "autotest")')
options, args = parser.parse_args()

if len(args) < 1:
    parser.print_help()
    sys.exit(1)

# Reject a malformed job id with a usage error instead of a traceback.
try:
    job_id = int(args[0])
except ValueError:
    parser.error('Job id must be an integer, got %r' % (args[0],))

autotest_host = rpc.get_autotest_server(options.web)
proxy = rpc.afe_comm(autotest_host)

# job exists?
jobs = proxy.run('get_jobs', id=job_id)
if not jobs:
    print('No such job %s' % (job_id,))
    sys.exit(1)
job = jobs[0]
owner = job['owner']

RUNNING_HQE_STATUSES = host_queue_entry_states.ACTIVE_STATUSES

# any entry eligible for this host?
queue_entries = proxy.run('get_host_queue_entries', job__id=job_id)

### Divine why an atomic group job is or is not running.
if queue_entries and queue_entries[0]['atomic_group']:
    if queue_entries[0]['status'] in RUNNING_HQE_STATUSES:
        print('Job %d appears to have started (status: %s).' % (
            job_id, queue_entries[0]['status']))
        sys.exit(0)
    # Hosts in Repairing or Repair Failed will have Queued queue entries.
    # We shouldn't consider those queue entries as a multi-group job.
    repair_hostnames = []
    for queue_entry in queue_entries:
        entry_host = queue_entry['host']
        if entry_host and entry_host['status']:
            if entry_host['status'].startswith('Repair'):
                repair_hostnames.append(entry_host['hostname'])
        if queue_entry['status'] in ('Completed', 'Stopped'):
            print('This job has already finished.')
            sys.exit(0)
    queue_entries_with_hosts = [queue_entry for queue_entry in queue_entries
                                if queue_entry['host']]
    all_queue_entries_have_hosts = (len(queue_entries) ==
                                    len(queue_entries_with_hosts))
    if (not all_queue_entries_have_hosts and len(queue_entries) > 1 and
        not repair_hostnames):
        # We test repair_hostnames so that this message is not printed when
        # the script is run on an atomic group job which has hosts assigned
        # but is not running because too many of them are in Repairing or will
        # never run because hosts have exited Repairing into the Repair Failed
        # dead end.
        print('This script does not support multi-group atomic group jobs.')
        print('')
        print('Jobs scheduled in that state are typically unintentional.')
        print('')
        print('Did you perhaps schedule the job via the web frontend and ask')
        print('that it run on more than 1 (atomic group) of hosts via the ')
        print('"Run on any" box? If so, always enter 1 there when scheduling')
        print('jobs on anything marked "(atomic group)".')
        print('')
        print('%s non-started atomic group HostQueueEntries found for job %s'
              % (len(queue_entries), job_id))
        sys.exit(1)
    atomic_group_name = queue_entries[0]['atomic_group']['name']
    # Get the list of labels associated with this atomic group.
    atomic_labels = proxy.run('get_labels',
                              atomic_group__name=atomic_group_name)
    if len(atomic_labels) < 1:
        print('Job requests atomic group %s but no labels' %
              (atomic_group_name,))
        print('(and thus no hosts) are associated with that atomic group.')

    job_sync_count = job['synch_count']
    # Ugh! This is returned as a comma separated str of label names.
    if job.get('dependencies'):
        job_dependency_label_names = job['dependencies'].split(',')
    else:
        job_dependency_label_names = []

    # The queue entry's meta_host is compared against label names below, so
    # the name alone is enough to restrict the labels considered.  No extra
    # label lookup is made: the previous lookup filtered on the atomic group
    # name field and indexed the result with [0], which raises IndexError
    # whenever nothing matches.
    meta_host_name = queue_entries[0]['meta_host']

    # A mapping from label name -> a list of hostnames usable for this job.
    runnable_atomic_label_names = {}

    # A mapping from label name -> a (message, host_exclude_reasons) tuple,
    # where host_exclude_reasons is the map described within the loop below.
    # Any atomic group labels in this map are not ready to run the job for
    # the reasons contained within.
    atomic_label_exclude_reasons = {}

    for label in atomic_labels:
        label_name = label['name']
        if meta_host_name and meta_host_name != label_name:
            print('Cannot run on atomic label %s due to meta_host %s.' % (
                label_name, meta_host_name))
            continue
        for dep_name in job_dependency_label_names:
            if dep_name != label_name:
                print('Not checking hosts in atomic label %s against' % (
                    label_name,))
                print('job dependency label %s. There may be less hosts' % (
                    dep_name,))
                print('than examined below available to run this job.')

        # Get the list of hosts associated with this atomic group label.
        atomic_hosts = proxy.run('get_hosts', multiple_labels=[label_name])

        # A map of hostname -> A list of reasons it can't be used.
        host_exclude_reasons = {}

        atomic_hostnames = [h['hostname'] for h in atomic_hosts]

        # Map hostnames to a list of ACL names on that host.
        acl_groups = proxy.run('get_acl_groups',
                               hosts__hostname__in=atomic_hostnames)
        hostname_to_acl_name_list = {}
        for acl in acl_groups:
            for hostname in acl['hosts']:
                hostname_to_acl_name_list.setdefault(hostname, []).append(
                    acl['name'])

        # Exclude any hosts that ACLs deny us access to.  The excluded set is
        # computed from hostnames rather than by comparing result counts, so
        # it does not depend on the two RPC results having matching lengths.
        accessible_hosts = proxy.run('get_hosts',
                                     hostname__in=atomic_hostnames,
                                     aclgroup__users__login=owner)
        accessible_hostnames = set(h['hostname'] for h in accessible_hosts)
        acl_excluded_hostnames = set(atomic_hostnames) - accessible_hostnames
        for hostname in acl_excluded_hostnames:
            # A host that appears in none of the returned ACL groups simply
            # lists no ACL names instead of raising KeyError.
            acls = ','.join(hostname_to_acl_name_list.get(hostname, []))
            host_exclude_reasons.setdefault(hostname, []).append(
                'User %s does not have ACL access. ACLs: %s' % (
                    owner, acls))

        # Check for locked hosts.  (The loop variable is deliberately not
        # called "host", to avoid clobbering the imported module of that
        # name.)
        for atomic_host in atomic_hosts:
            if not atomic_host['locked']:
                continue
            locker = atomic_host.get('locked_by') or 'UNKNOWN'
            msg = 'Locked by user %s on %s. No jobs will schedule on it.' % (
                locker, atomic_host.get('lock_time'))
            host_exclude_reasons.setdefault(
                atomic_host['hostname'], []).append(msg)

        # Exclude hosts that are not Ready.
        for atomic_host in atomic_hosts:
            hostname = atomic_host['hostname']
            if atomic_host['status'] == 'Ready':
                continue
            message = 'Status is %s' % (atomic_host['status'],)
            if atomic_host['status'] in ('Verifying', 'Pending', 'Running'):
                # Name the job currently occupying the host when we can.
                running_hqes = proxy.run(
                    'get_host_queue_entries', host__hostname=hostname,
                    status__in=RUNNING_HQE_STATUSES)
                if not running_hqes:
                    message += ' (unknown job)'
                else:
                    message += ' (job %d)' % running_hqes[0]['job']['id']
            host_exclude_reasons.setdefault(hostname, []).append(message)

        # If we don't have enough usable hosts, this group cannot run the job.
        usable_hostnames = [h['hostname'] for h in atomic_hosts
                            if h['hostname'] not in host_exclude_reasons]
        if len(usable_hostnames) < job_sync_count:
            message = ('%d hosts are required but only %d available.' %
                       (job_sync_count, len(usable_hostnames)))
            atomic_label_exclude_reasons[label_name] = (message,
                                                        host_exclude_reasons)
        else:
            runnable_atomic_label_names[label_name] = usable_hostnames

    for label_name, reason_tuple in atomic_label_exclude_reasons.items():
        job_reason, hosts_reasons = reason_tuple
        print('Atomic group "%s" via label "%s" CANNOT run job %d because:' % (
            atomic_group_name, label_name, job_id))
        print(job_reason)
        for hostname in sorted(hosts_reasons):
            for reason in hosts_reasons[hostname]:
                print('%s\t%s' % (hostname, reason))
        print('')

    for label_name, host_list in runnable_atomic_label_names.items():
        print('Atomic group "%s" via label "%s" is READY to run job %d on:' % (
            atomic_group_name, label_name, job_id))
        print(', '.join(host_list))
        print('Is the job scheduler healthy?')
        print('')

    sys.exit(0)


### Not an atomic group synchronous job:

if len(args) != 2:
    # No hostname on the command line: it can only be inferred when the job
    # has exactly one queue entry and that entry has a host assigned.
    if len(queue_entries) == 1 and queue_entries[0]['host']:
        hostname = queue_entries[0]['host']['hostname']
    else:
        parser.print_help()
        print('\nERROR: A hostname associated with the job is required.')
        sys.exit(1)
else:
    hostname = args[1]

# host exists?
hosts = proxy.run('get_hosts', hostname=hostname)
if not hosts:
    print('No such host %s' % (hostname,))
    sys.exit(1)
host = hosts[0]

# Boolean to track our findings. We want to list all reasons it won't run,
# not just the first.
# Per-host diagnosis for a job that is not an atomic group job.  Every check
# below prints its own finding and clears job_will_run, so that all reasons
# are reported rather than only the first one encountered.
job_will_run = True

entries_for_this_host = [entry for entry in queue_entries
                         if entry['host']
                         and entry['host']['hostname'] == hostname]
host_label_names = set(host['labels'])
eligible_metahost_entries = [entry for entry in queue_entries
                             if entry['meta_host'] and not entry['host']
                             and entry['meta_host'] in host_label_names
                             and not entry['complete']]

if entries_for_this_host:
    # Checked explicitly rather than with assert, which is stripped when
    # Python runs with -O.
    if len(entries_for_this_host) != 1:
        print('Multiple entries for this job assigned to this host!')
        sys.exit(1)
    entry = entries_for_this_host[0]
    if entry['active'] or entry['complete']:
        print('Job already ran or is running on this host! (status: %s)' %
              (entry['full_status'],))
        sys.exit(0)
    is_metahost = False
else:
    # no entry for this host -- maybe an eligible metahost entry?
    if not eligible_metahost_entries:
        print("Host isn't scheduled for this job, and no eligible metahost "
              "entry exists")
        sys.exit(0)
    is_metahost = True

# meets atomic group requirements?
host_labels = proxy.run('get_labels', name__in=list(host_label_names))
host_atomic_group_labels = [label for label in host_labels
                            if label['atomic_group']]
host_atomic_group_name = None
# Pre-bound so the mismatch message below can never hit an unbound name.
host_atomic_group_label = None
host_atomic_group_label_name = None
if host_atomic_group_labels:
    atomic_groups = set(label['atomic_group']['name']
                        for label in host_atomic_group_labels)
    if len(atomic_groups) != 1:
        print('Host has more than one atomic group!')
        print(sorted(atomic_groups))
        sys.exit(1)
    host_atomic_group_label = host_atomic_group_labels[0]
    host_atomic_group_label_name = host_atomic_group_label['name']
    host_atomic_group_name = host_atomic_group_label['atomic_group']['name']

# Collect atomic group *names* (or None) per queue entry.  The atomic_group
# values themselves are indexed with ['name'] elsewhere, i.e. they are
# mappings, and a set of dicts would raise "unhashable type".
job_atomic_group_names = set(
    entry['atomic_group']['name'] if entry['atomic_group'] else None
    for entry in queue_entries)
if len(job_atomic_group_names) != 1:
    print('Job has more than one atomic group value!')
    sys.exit(1)
job_atomic_group_name = job_atomic_group_names.pop()  # might be None

if host_atomic_group_name != job_atomic_group_name:
    print('Job is for atomic group %s, but host is in atomic group %s '
          '(label %s)' %
          (job_atomic_group_name, host_atomic_group_name,
           host_atomic_group_label_name))
    job_will_run = False

# host locked?
if host['locked']:
    print('Host is locked by %s no jobs will schedule on it.' %
          (host['locked_by'],))
    job_will_run = False

# acl accessible?
accessible = proxy.run('get_hosts', hostname=hostname,
                       aclgroup__users__login=owner)
if not accessible:
    host_acls = ', '.join(group['name'] for group in
                          proxy.run('get_acl_groups', hosts__hostname=hostname))
    owner_acls = ', '.join(group['name'] for group in
                           proxy.run('get_acl_groups', users__login=owner))
    print('Host not ACL-accessible to job owner %s' % (owner,))
    print(' Host ACLs: %s' % (host_acls,))
    print(' Owner Acls: %s' % (owner_acls,))
    job_will_run = False

# meets dependencies?
# Dependencies arrive as a comma separated string of label names; a missing,
# None or empty value means the job has no dependencies.
job_deps = set(dep_name
               for dep_name in (job.get('dependencies') or '').split(',')
               if dep_name)
unmet = job_deps - host_label_names
if unmet:
    # Sorted so the report is stable from run to run.
    print("Host labels (%s) don't satisfy job dependencies: %s" %
          (', '.join(sorted(host_label_names)), ', '.join(sorted(unmet))))
    job_will_run = False

# at this point, if the job is for an unassigned atomic group, things are too
# complicated to proceed
unassigned_atomic_group_entries = [entry for entry in queue_entries
                                   if entry['atomic_group']
                                   and not entry['host']]
if unassigned_atomic_group_entries:
    print("Job is for an unassigned atomic group. That's too complicated, I "
          "can't give you any definite answers. Sorry.")
    sys.exit(1)

# meets only_if_needed labels?
if is_metahost:
    metahost_names = set(entry['meta_host']
                         for entry in eligible_metahost_entries)
    job_deps_and_metahosts = job_deps.union(metahost_names)
    for label in host_labels:
        unmet_exclusive_label = (label['only_if_needed'] and
                                 label['name'] not in job_deps_and_metahosts)
        if unmet_exclusive_label:
            print('Host contains "only if needed" label %s, unused by job '
                  'dependencies and metahosts' % (label['name'],))
            job_will_run = False

# host ready?
if host['status'] != 'Ready':
    if host['status'] == 'Pending':
        # A Pending host with no active queue entry is reported as an
        # inconsistency and ends the run.
        active = proxy.run('get_host_queue_entries',
                           host=host['id'], active=True)
        if not active:
            print('Host %s seems to be in "Pending" state incorrectly; please '
                  'report this to the Autotest team' % (hostname,))
            sys.exit(1)
    print('Host not in "Ready" status (status="%s")' % (host['status'],))
    job_will_run = False

# Exit status is 1 when nothing explains the delay (the job should be
# running), and 0 when at least one reason was printed above.
if job_will_run:
    print("Job %s should run on host %s; if you've already waited about ten "
          "minutes or longer, it's probably a server issue or a bug." %
          (job_id, hostname))
    sys.exit(1)
else:
    print("All of the reasons this job is not running are listed above.")
    sys.exit(0)