1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5from distutils import version 6import cStringIO 7import HTMLParser 8import httplib 9import json 10import logging 11import multiprocessing 12import os 13import re 14import socket 15import time 16import urllib2 17import urlparse 18 19from autotest_lib.client.bin import utils as bin_utils 20from autotest_lib.client.common_lib import android_utils 21from autotest_lib.client.common_lib import error 22from autotest_lib.client.common_lib import global_config 23from autotest_lib.client.common_lib import utils 24from autotest_lib.client.common_lib.cros import retry 25from autotest_lib.server import utils as server_utils 26# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107 27 28try: 29 from chromite.lib import metrics 30except ImportError: 31 metrics = utils.metrics_mock 32 33 34CONFIG = global_config.global_config 35# This file is generated at build time and specifies, per suite and per test, 36# the DEPENDENCIES list specified in each control file. It's a dict of dicts: 37# {'bvt': {'/path/to/autotest/control/site_tests/test1/control': ['dep1']} 38# 'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']} 39# 'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'], 40# '/path/to/autotest/control/site_tests/test3/control': ['dep3']} 41# } 42DEPENDENCIES_FILE = 'test_suites/dependency_info' 43# Number of seconds for caller to poll devserver's is_staged call to check if 44# artifacts are staged. 45_ARTIFACT_STAGE_POLLING_INTERVAL = 5 46# Artifacts that should be staged when client calls devserver RPC to stage an 47# image. 48_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful' 49# Artifacts that should be staged when client calls devserver RPC to stage an 50# image with autotest artifact. 51_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,' 52 'control_files,stateful,' 53 'autotest_packages') 54# Artifacts that should be staged when client calls devserver RPC to stage an 55# Android build. 56_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions') 57SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value( 58 'CROS', 'skip_devserver_health_check', type=bool) 59# Number of seconds for the call to get devserver load to time out. 60TIMEOUT_GET_DEVSERVER_LOAD = 2.0 61 62# Android artifact path in devserver 63ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value( 64 'CROS', 'android_build_name_pattern', type=str).replace('\\', '') 65 66# Return value from a devserver RPC indicating the call succeeded. 67SUCCESS = 'Success' 68 69# The timeout minutes for a given devserver ssh call. 70DEVSERVER_SSH_TIMEOUT_MINS = 1 71 72# Error message for invalid devserver response. 73ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error' 74 75# Error message for devserver call timedout. 76ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout' 77 78# The timeout minutes for waiting a devserver staging. 79DEVSERVER_IS_STAGING_RETRY_MIN = 100 80 81# The timeout minutes for waiting a DUT auto-update finished. 82DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100 83 84# The total times of devserver triggering CrOS auto-update. 85AU_RETRY_LIMIT = 2 86 87# Number of seconds for caller to poll devserver's get_au_status call to 88# check if cros auto-update is finished. 89CROS_AU_POLLING_INTERVAL = 10 90 91# Number of seconds for intervals between retrying auto-update calls. 92CROS_AU_RETRY_INTERVAL = 20 93 94# The file name for auto-update logs. 95CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log' 96 97# Provision error patterns. 98# People who see this should know that they shouldn't change these 99# classification strings. These strings are used for monitoring provision 100# failures. Any changes may mess up the stats. 101_EXCEPTION_PATTERNS = [ 102 # Raised when devserver portfile does not exist on host. 103 (r".*Devserver portfile does not exist!.*$", 104 '(1) Devserver portfile does not exist on host'), 105 # Raised when devserver cannot copy packages to host. 106 (r".*Could not copy .* to device.*$", 107 '(2) Cannot copy packages to host'), 108 # Raised when devserver fails to run specific commands on host. 109 (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$", 110 '(3) Fail to run specific command on host'), 111 # Raised when new build fails to boot on the host. 112 (r'.*RootfsUpdateError: Build .* failed to boot on.*$', 113 '(4) Build failed to boot on host'), 114 # Raised when the auto-update process is timed out. 115 (r'.*The CrOS auto-update process is timed out, ' 116 'thus will be terminated.*$', 117 '(5) Auto-update is timed out'), 118 # Raised when the host is not pingable. 119 (r".*DeviceNotPingableError.*$", 120 '(6) Host is not pingable during auto-update'), 121 # Raised when hosts have unexpected status after rootfs update. 122 (r'.*Update failed with unexpected update status: ' 123 'UPDATE_STATUS_IDLE.*$', 124 '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs ' 125 'update'), 126 # Raised when devserver returns non-json response to shard/drone. 127 (r'.*No JSON object could be decoded.*$', 128 '(8) Devserver returned non-json object'), 129 # Raised when devserver loses host's ssh connection 130 (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$', 131 "(9) Devserver lost host's ssh connection"), 132 # Raised when error happens in writing files to host 133 (r'.*Write failed\: Broken pipe.*$', 134 "(10) Broken pipe while writing or connecting to host")] 135 136PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value( 137 'CROS', 'prefer_local_devserver', type=bool, default=False) 138 139ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value( 140 'CROS', 'enable_ssh_connection_for_devserver', type=bool, 141 default=False) 142 143# Directory to save auto-update logs 144AUTO_UPDATE_LOG_DIR = 'autoupdate_logs' 145 146DEFAULT_SUBNET_MASKBIT = 19 147 148 149class DevServerException(Exception): 150 """Raised when the dev server returns a non-200 HTTP response.""" 151 pass 152 153class RetryableProvisionException(DevServerException): 154 """Raised when provision fails due to a retryable reason.""" 155 pass 156 157class DevServerOverloadException(Exception): 158 """Raised when the dev server returns a 502 HTTP response.""" 159 pass 160 161class DevServerFailToLocateException(Exception): 162 """Raised when fail to locate any devserver.""" 163 pass 164 165class MarkupStripper(HTMLParser.HTMLParser): 166 """HTML parser that strips HTML tags, coded characters like & 167 168 Works by, basically, not doing anything for any tags, and only recording 169 the content of text nodes in an internal data structure. 170 """ 171 def __init__(self): 172 self.reset() 173 self.fed = [] 174 175 176 def handle_data(self, d): 177 """Consume content of text nodes, store it away.""" 178 self.fed.append(d) 179 180 181 def get_data(self): 182 """Concatenate and return all stored data.""" 183 return ''.join(self.fed) 184 185 186def _strip_http_message(message): 187 """Strip the HTTP marker from the an HTTP message. 188 189 @param message: A string returned by an HTTP call. 190 191 @return: A string with HTTP marker being stripped. 192 """ 193 strip = MarkupStripper() 194 try: 195 strip.feed(message.decode('utf_32')) 196 except UnicodeDecodeError: 197 strip.feed(message) 198 return strip.get_data() 199 200 201def _get_image_storage_server(): 202 return CONFIG.get_config_value('CROS', 'image_storage_server', type=str) 203 204 205def _get_canary_channel_server(): 206 """ 207 Get the url of the canary-channel server, 208 eg: gsutil://chromeos-releases/canary-channel/<board>/<release> 209 210 @return: The url to the canary channel server. 211 """ 212 return CONFIG.get_config_value('CROS', 'canary_channel_server', type=str) 213 214 215def _get_storage_server_for_artifacts(artifacts=None): 216 """Gets the appropriate storage server for the given artifacts. 217 218 @param artifacts: A list of artifacts we need to stage. 219 @return: The address of the storage server that has these artifacts. 220 The default image storage server if no artifacts are specified. 221 """ 222 factory_artifact = global_config.global_config.get_config_value( 223 'CROS', 'factory_artifact', type=str, default='') 224 if artifacts and factory_artifact and factory_artifact in artifacts: 225 return _get_canary_channel_server() 226 return _get_image_storage_server() 227 228 229def _reverse_lookup_from_config(address): 230 """Look up hostname for the given IP address. 231 232 This uses the hostname-address map from the config file. 233 234 If multiple hostnames map to the same IP address, the first one 235 defined in the configuration file takes precedence. 236 237 @param address: IP address string 238 @returns: hostname string, or original input if not found 239 """ 240 for hostname, addr in _get_hostname_addr_map().iteritems(): 241 if addr == address: 242 return hostname 243 return address 244 245 246def _get_hostname_addr_map(): 247 """Get hostname address mapping from config. 248 249 @return: dict mapping server hostnames to addresses 250 """ 251 return CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP') 252 253 254def _get_dev_server_list(): 255 return CONFIG.get_config_value('CROS', 'dev_server', type=list, default=[]) 256 257 258def _get_crash_server_list(): 259 return CONFIG.get_config_value('CROS', 'crash_server', type=list, 260 default=[]) 261 262 263def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN, 264 exception_to_raise=DevServerException): 265 """A decorator to use with remote devserver calls. 266 267 This decorator converts urllib2.HTTPErrors into DevServerExceptions 268 with any embedded error info converted into plain text. The method 269 retries on urllib2.URLError or error.CmdError to avoid devserver flakiness. 270 """ 271 #pylint: disable=C0111 272 273 def inner_decorator(method): 274 label = method.__name__ if hasattr(method, '__name__') else None 275 def metrics_wrapper(*args, **kwargs): 276 @retry.retry((urllib2.URLError, error.CmdError, 277 DevServerOverloadException), 278 timeout_min=timeout_min, 279 exception_to_raise=exception_to_raise, 280 label=label) 281 def wrapper(): 282 """This wrapper actually catches the HTTPError.""" 283 try: 284 return method(*args, **kwargs) 285 except urllib2.HTTPError as e: 286 error_markup = e.read() 287 raise DevServerException(_strip_http_message(error_markup)) 288 289 try: 290 return wrapper() 291 except Exception as e: 292 if ERR_MSG_FOR_TIMED_OUT_CALL in str(e): 293 dev_server = None 294 if args and isinstance(args[0], DevServer): 295 dev_server = args[0].hostname 296 elif 'devserver' in kwargs: 297 dev_server = get_hostname(kwargs['devserver']) 298 299 logging.debug('RPC call %s has timed out on devserver %s.', 300 label, dev_server) 301 c = metrics.Counter( 302 'chromeos/autotest/devserver/call_timeout') 303 c.increment(fields={'dev_server': dev_server, 304 'healthy': label}) 305 306 raise 307 308 return metrics_wrapper 309 310 return inner_decorator 311 312 313def get_hostname(url): 314 """Get the hostname portion of a URL 315 316 schema://hostname:port/path 317 318 @param url: a Url string 319 @return: a hostname string 320 """ 321 return urlparse.urlparse(url).hostname 322 323 324class DevServer(object): 325 """Base class for all DevServer-like server stubs. 326 327 This is the base class for interacting with all Dev Server-like servers. 328 A caller should instantiate a sub-class of DevServer with: 329 330 host = SubClassServer.resolve(build) 331 server = SubClassServer(host) 332 """ 333 _MIN_FREE_DISK_SPACE_GB = 20 334 _MAX_APACHE_CLIENT_COUNT = 75 335 # Threshold for the CPU load percentage for a devserver to be selected. 336 MAX_CPU_LOAD = 80.0 337 # Threshold for the network IO, set to 80MB/s 338 MAX_NETWORK_IO = 1024 * 1024 * 80 339 DISK_IO = 'disk_total_bytes_per_second' 340 NETWORK_IO = 'network_total_bytes_per_second' 341 CPU_LOAD = 'cpu_percent' 342 FREE_DISK = 'free_disk' 343 AU_PROCESS = 'au_process_count' 344 STAGING_THREAD_COUNT = 'staging_thread_count' 345 APACHE_CLIENT_COUNT = 'apache_client_count' 346 347 348 def __init__(self, devserver): 349 self._devserver = devserver 350 351 352 def url(self): 353 """Returns the url for this devserver.""" 354 return self._devserver 355 356 357 @property 358 def hostname(self): 359 """Return devserver hostname parsed from the devserver URL. 360 361 Note that this is likely parsed from the devserver URL from 362 shadow_config.ini, meaning that the "hostname" part of the 363 devserver URL is actually an IP address. 364 365 @return hostname string 366 """ 367 return get_hostname(self.url()) 368 369 370 @property 371 def resolved_hostname(self): 372 """Return devserver hostname, resolved from its IP address. 373 374 Unlike the hostname property, this property attempts to look up 375 the proper hostname from the devserver IP address. If lookup 376 fails, then fall back to whatever the hostname property would 377 have returned. 378 379 @return hostname string 380 """ 381 return _reverse_lookup_from_config(self.hostname) 382 383 384 @staticmethod 385 def get_server_url(url): 386 """Get the devserver url from a repo url, which includes build info. 387 388 @param url: A job repo url. 389 390 @return A devserver url, e.g., http://127.0.0.10:8080 391 """ 392 res = urlparse.urlparse(url) 393 if res.netloc: 394 return res.scheme + '://' + res.netloc 395 396 397 @classmethod 398 def get_devserver_load_wrapper(cls, devserver, timeout_sec, output): 399 """A wrapper function to call get_devserver_load in parallel. 400 401 @param devserver: url of the devserver. 402 @param timeout_sec: Number of seconds before time out the devserver 403 call. 404 @param output: An output queue to save results to. 405 """ 406 load = cls.get_devserver_load(devserver, timeout_min=timeout_sec/60.0) 407 if load: 408 load['devserver'] = devserver 409 output.put(load) 410 411 412 @classmethod 413 def get_devserver_load(cls, devserver, 414 timeout_min=DEVSERVER_SSH_TIMEOUT_MINS): 415 """Returns True if the |devserver| is healthy to stage build. 416 417 @param devserver: url of the devserver. 418 @param timeout_min: How long to wait in minutes before deciding the 419 the devserver is not up (float). 420 421 @return: A dictionary of the devserver's load. 422 423 """ 424 call = cls._build_call(devserver, 'check_health') 425 @remote_devserver_call(timeout_min=timeout_min) 426 def get_load(devserver=devserver): 427 """Inner method that makes the call.""" 428 return cls.run_call(call, timeout=timeout_min*60) 429 430 try: 431 return json.load(cStringIO.StringIO(get_load(devserver=devserver))) 432 except Exception as e: 433 logging.error('Devserver call failed: "%s", timeout: %s seconds,' 434 ' Error: %s', call, timeout_min * 60, e) 435 436 437 @classmethod 438 def is_free_disk_ok(cls, load): 439 """Check if a devserver has enough free disk. 440 441 @param load: A dict of the load of the devserver. 442 443 @return: True if the devserver has enough free disk or disk check is 444 skipped in global config. 445 446 """ 447 if SKIP_DEVSERVER_HEALTH_CHECK: 448 logging.debug('devserver health check is skipped.') 449 elif load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB: 450 return False 451 452 return True 453 454 455 @classmethod 456 def is_apache_client_count_ok(cls, load): 457 """Check if a devserver has enough Apache connections available. 458 459 Apache server by default has maximum of 150 concurrent connections. If 460 a devserver has too many live connections, it likely indicates the 461 server is busy handling many long running download requests, e.g., 462 downloading stateful partitions. It is better not to add more requests 463 to it. 464 465 @param load: A dict of the load of the devserver. 466 467 @return: True if the devserver has enough Apache connections available, 468 or disk check is skipped in global config. 469 470 """ 471 if SKIP_DEVSERVER_HEALTH_CHECK: 472 logging.debug('devserver health check is skipped.') 473 elif cls.APACHE_CLIENT_COUNT not in load: 474 logging.debug('Apache client count is not collected from devserver.') 475 elif (load[cls.APACHE_CLIENT_COUNT] > 476 cls._MAX_APACHE_CLIENT_COUNT): 477 return False 478 479 return True 480 481 482 @classmethod 483 def devserver_healthy(cls, devserver, 484 timeout_min=DEVSERVER_SSH_TIMEOUT_MINS): 485 """Returns True if the |devserver| is healthy to stage build. 486 487 @param devserver: url of the devserver. 488 @param timeout_min: How long to wait in minutes before deciding the 489 the devserver is not up (float). 490 491 @return: True if devserver is healthy. Return False otherwise. 492 493 """ 494 c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy') 495 reason = '' 496 healthy = False 497 load = cls.get_devserver_load(devserver, timeout_min=timeout_min) 498 try: 499 if not load: 500 # Failed to get the load of devserver. 501 reason = '(1) Failed to get load.' 502 return False 503 504 apache_ok = cls.is_apache_client_count_ok(load) 505 if not apache_ok: 506 reason = '(2) Apache client count too high.' 507 logging.error('Devserver check_health failed. Live Apache client ' 508 'count is too high: %d.', 509 load[cls.APACHE_CLIENT_COUNT]) 510 return False 511 512 disk_ok = cls.is_free_disk_ok(load) 513 if not disk_ok: 514 reason = '(3) Disk space too low.' 515 logging.error('Devserver check_health failed. Free disk space is ' 516 'low. Only %dGB is available.', 517 load[cls.FREE_DISK]) 518 healthy = bool(disk_ok) 519 return disk_ok 520 finally: 521 c.increment(fields={'dev_server': cls(devserver).resolved_hostname, 522 'healthy': healthy, 523 'reason': reason}) 524 # Monitor how many AU processes the devserver is currently running. 525 if load is not None and load.get(DevServer.AU_PROCESS): 526 c_au = metrics.Gauge( 527 'chromeos/autotest/devserver/devserver_au_count') 528 c_au.set( 529 load.get(DevServer.AU_PROCESS), 530 fields={'dev_server': cls(devserver).resolved_hostname}) 531 532 533 @staticmethod 534 def _build_call(host, method, **kwargs): 535 """Build a URL to |host| that calls |method|, passing |kwargs|. 536 537 Builds a URL that calls |method| on the dev server defined by |host|, 538 passing a set of key/value pairs built from the dict |kwargs|. 539 540 @param host: a string that is the host basename e.g. http://server:90. 541 @param method: the dev server method to call. 542 @param kwargs: a dict mapping arg names to arg values. 543 @return the URL string. 544 """ 545 argstr = '&'.join(map(lambda x: "%s=%s" % x, kwargs.iteritems())) 546 return "%(host)s/%(method)s?%(argstr)s" % dict( 547 host=host, method=method, argstr=argstr) 548 549 550 def build_call(self, method, **kwargs): 551 """Builds a devserver RPC string that is used by 'run_call()'. 552 553 @param method: remote devserver method to call. 554 """ 555 return self._build_call(self._devserver, method, **kwargs) 556 557 558 @classmethod 559 def build_all_calls(cls, method, **kwargs): 560 """Builds a list of URLs that makes RPC calls on all devservers. 561 562 Build a URL that calls |method| on the dev server, passing a set 563 of key/value pairs built from the dict |kwargs|. 564 565 @param method: the dev server method to call. 566 @param kwargs: a dict mapping arg names to arg values 567 568 @return the URL string 569 """ 570 calls = [] 571 # Note we use cls.servers as servers is class specific. 572 for server in cls.servers(): 573 if cls.devserver_healthy(server): 574 calls.append(cls._build_call(server, method, **kwargs)) 575 576 return calls 577 578 579 @classmethod 580 def run_call(cls, call, readline=False, timeout=None): 581 """Invoke a given devserver call using urllib.open. 582 583 Open the URL with HTTP, and return the text of the response. Exceptions 584 may be raised as for urllib2.urlopen(). 585 586 @param call: a url string that calls a method to a devserver. 587 @param readline: whether read http response line by line. 588 @param timeout: The timeout seconds for this urlopen call. 589 590 @return the results of this call. 591 """ 592 if timeout is not None: 593 return utils.urlopen_socket_timeout( 594 call, timeout=timeout).read() 595 elif readline: 596 response = urllib2.urlopen(call) 597 return [line.rstrip() for line in response] 598 else: 599 return urllib2.urlopen(call).read() 600 601 602 @staticmethod 603 def servers(): 604 """Returns a list of servers that can serve as this type of server.""" 605 raise NotImplementedError() 606 607 608 @classmethod 609 def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT, 610 unrestricted_only=False): 611 """Get the devservers in the same subnet of the given ip. 612 613 @param ip: The IP address of a dut to look for devserver. 614 @param mask_bits: Number of mask bits. Default is 19. 615 @param unrestricted_only: Set to True to select from devserver in 616 unrestricted subnet only. Default is False. 617 618 @return: A list of devservers in the same subnet of the given ip. 619 620 """ 621 # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so 622 # we need a dict to return the full devserver path once the IPs are 623 # filtered in get_servers_in_same_subnet. 624 server_names = {} 625 all_devservers = [] 626 devservers = (cls.get_unrestricted_devservers() if unrestricted_only 627 else cls.servers()) 628 for server in devservers: 629 server_name = get_hostname(server) 630 server_names[server_name] = server 631 all_devservers.append(server_name) 632 if not all_devservers: 633 devserver_type = 'unrestricted only' if unrestricted_only else 'all' 634 raise DevServerFailToLocateException( 635 'Fail to locate a devserver for dut %s in %s devservers' 636 % (ip, devserver_type)) 637 638 devservers = utils.get_servers_in_same_subnet(ip, mask_bits, 639 all_devservers) 640 return [server_names[s] for s in devservers] 641 642 643 @classmethod 644 def get_unrestricted_devservers( 645 cls, restricted_subnets=utils.RESTRICTED_SUBNETS): 646 """Get the devservers not in any restricted subnet specified in 647 restricted_subnets. 648 649 @param restricted_subnets: A list of restriected subnets. 650 651 @return: A list of devservers not in any restricted subnet. 652 653 """ 654 if not restricted_subnets: 655 return cls.servers() 656 657 devservers = [] 658 for server in cls.servers(): 659 server_name = get_hostname(server) 660 if not utils.get_restricted_subnet(server_name, restricted_subnets): 661 devservers.append(server) 662 return devservers 663 664 665 @classmethod 666 def get_healthy_devserver(cls, build, devservers, ban_list=None): 667 """"Get a healthy devserver instance from the list of devservers. 668 669 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 670 @param devservers: The devserver list to be chosen out a healthy one. 671 @param ban_list: The blacklist of devservers we don't want to choose. 672 Default is None. 673 674 @return: A DevServer object of a healthy devserver. Return None if no 675 healthy devserver is found. 676 677 """ 678 logging.debug('Pick one healthy devserver from %r', devservers) 679 while devservers: 680 hash_index = hash(build) % len(devservers) 681 devserver = devservers.pop(hash_index) 682 logging.debug('Check health for %s', devserver) 683 if ban_list and devserver in ban_list: 684 continue 685 686 if cls.devserver_healthy(devserver): 687 logging.debug('Pick %s', devserver) 688 return cls(devserver) 689 690 691 @classmethod 692 def get_available_devservers(cls, hostname=None, 693 prefer_local_devserver=PREFER_LOCAL_DEVSERVER, 694 restricted_subnets=utils.RESTRICTED_SUBNETS): 695 """Get devservers in the same subnet of the given hostname. 696 697 @param hostname: Hostname of a DUT to choose devserver for. 698 699 @return: A tuple of (devservers, can_retry), devservers is a list of 700 devservers that's available for the given hostname. can_retry 701 is a flag that indicate if caller can retry the selection of 702 devserver if no devserver in the returned devservers can be 703 used. For example, if hostname is in a restricted subnet, 704 can_retry will be False. 705 """ 706 logging.info('Getting devservers for host: %s', hostname) 707 host_ip = None 708 if hostname: 709 host_ip = bin_utils.get_ip_address(hostname) 710 if not host_ip: 711 logging.error('Failed to get IP address of %s. Will pick a ' 712 'devserver without subnet constraint.', hostname) 713 714 if not host_ip: 715 return cls.get_unrestricted_devservers(restricted_subnets), False 716 717 # Go through all restricted subnet settings and check if the DUT is 718 # inside a restricted subnet. If so, only return the devservers in the 719 # restricted subnet and doesn't allow retry. 720 if host_ip and restricted_subnets: 721 for subnet_ip, mask_bits in restricted_subnets: 722 if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits): 723 logging.debug('The host %s (%s) is in a restricted subnet. ' 724 'Try to locate a devserver inside subnet ' 725 '%s:%d.', hostname, host_ip, subnet_ip, 726 mask_bits) 727 devservers = cls.get_devservers_in_same_subnet( 728 subnet_ip, mask_bits) 729 return devservers, False 730 731 # If prefer_local_devserver is set to True and the host is not in 732 # restricted subnet, pick a devserver in the same subnet if possible. 733 # Set can_retry to True so it can pick a different devserver if all 734 # devservers in the same subnet are down. 735 if prefer_local_devserver: 736 return (cls.get_devservers_in_same_subnet( 737 host_ip, DEFAULT_SUBNET_MASKBIT, True), True) 738 739 return cls.get_unrestricted_devservers(restricted_subnets), False 740 741 742 @classmethod 743 def resolve(cls, build, hostname=None, ban_list=None): 744 """"Resolves a build to a devserver instance. 745 746 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 747 @param hostname: The hostname of dut that requests a devserver. It's 748 used to make sure a devserver in the same subnet is 749 preferred. 750 @param ban_list: The blacklist of devservers shouldn't be chosen. 751 752 @raise DevServerException: If no devserver is available. 753 """ 754 tried_devservers = set() 755 devservers, can_retry = cls.get_available_devservers(hostname) 756 if devservers: 757 tried_devservers |= set(devservers) 758 759 devserver = cls.get_healthy_devserver(build, devservers, 760 ban_list=ban_list) 761 762 if not devserver and can_retry: 763 # Find available devservers without dut location constrain. 764 devservers, _ = cls.get_available_devservers() 765 devserver = cls.get_healthy_devserver(build, devservers, 766 ban_list=ban_list) 767 if devservers: 768 tried_devservers |= set(devservers) 769 if devserver: 770 return devserver 771 else: 772 error_msg = ('All devservers are currently down: %s. ' 773 'dut hostname: %s' % 774 (tried_devservers, hostname)) 775 logging.error(error_msg) 776 raise DevServerException(error_msg) 777 778 779 @classmethod 780 def random(cls): 781 """Return a random devserver that's available. 782 783 Devserver election in `resolve` method is based on a hash of the 784 build that a caller wants to stage. The purpose is that different 785 callers requesting for the same build can get the same devserver, 786 while the lab is able to distribute different builds across all 787 devservers. That helps to reduce the duplication of builds across 788 all devservers. 789 This function returns a random devserver, by passing a random 790 pseudo build name to `resolve `method. 791 """ 792 return cls.resolve(build=str(time.time())) 793 794 795class CrashServer(DevServer): 796 """Class of DevServer that symbolicates crash dumps.""" 797 798 @staticmethod 799 def servers(): 800 return _get_crash_server_list() 801 802 803 @remote_devserver_call() 804 def symbolicate_dump(self, minidump_path, build): 805 """Ask the devserver to symbolicate the dump at minidump_path. 806 807 Stage the debug symbols for |build| and, if that works, ask the 808 devserver to symbolicate the dump at |minidump_path|. 809 810 @param minidump_path: the on-disk path of the minidump. 811 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 812 whose debug symbols are needed for symbolication. 813 @return The contents of the stack trace 814 @raise DevServerException upon any return code that's not HTTP OK. 815 """ 816 try: 817 import requests 818 except ImportError: 819 logging.warning("Can't 'import requests' to connect to dev server.") 820 return '' 821 f = {'dev_server': self.resolved_hostname} 822 c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump') 823 c.increment(fields=f) 824 # Symbolicate minidump. 825 m = 'chromeos/autotest/crashserver/symbolicate_dump_duration' 826 with metrics.SecondsTimer(m, fields=f): 827 call = self.build_call('symbolicate_dump', 828 archive_url=_get_image_storage_server() + build) 829 request = requests.post( 830 call, files={'minidump': open(minidump_path, 'rb')}) 831 if request.status_code == requests.codes.OK: 832 return request.text 833 834 error_fd = cStringIO.StringIO(request.text) 835 raise urllib2.HTTPError( 836 call, request.status_code, request.text, request.headers, 837 error_fd) 838 839 840 @classmethod 841 def get_available_devservers(cls, hostname): 842 """Get all available crash servers. 843 844 Crash server election doesn't need to count the location of hostname. 845 846 @param hostname: Hostname of a DUT to choose devserver for. 847 848 @return: A tuple of (all crash servers, False). can_retry is set to 849 False, as all crash servers are returned. There is no point to 850 retry. 851 """ 852 return cls.servers(), False 853 854 855class ImageServerBase(DevServer): 856 """Base class for devservers used to stage builds. 857 858 CrOS and Android builds are staged in different ways as they have different 859 sets of artifacts. This base class abstracts the shared functions between 860 the two types of ImageServer. 861 """ 862 863 @classmethod 864 def servers(cls): 865 """Returns a list of servers that can serve as a desired type of 866 devserver. 867 """ 868 return _get_dev_server_list() 869 870 871 def _get_image_url(self, image): 872 """Returns the url of the directory for this image on the devserver. 873 874 @param image: the image that was fetched. 875 """ 876 image = self.translate(image) 877 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 878 type=str) 879 return (url_pattern % (self.url(), image)).replace('update', 'static') 880 881 882 @staticmethod 883 def create_metadata(server_name, image, artifacts=None, files=None): 884 """Create a metadata dictionary given the staged items. 885 886 The metadata can be send to metadata db along with stats. 887 888 @param server_name: name of the devserver, e.g 172.22.33.44. 889 @param image: The name of the image. 890 @param artifacts: A list of artifacts. 891 @param files: A list of files. 892 893 @return A metadata dictionary. 894 895 """ 896 metadata = {'devserver': server_name, 897 'image': image, 898 '_type': 'devserver'} 899 if artifacts: 900 metadata['artifacts'] = ' '.join(artifacts) 901 if files: 902 metadata['files'] = ' '.join(files) 903 return metadata 904 905 906 @classmethod 907 def run_ssh_call(cls, call, readline=False, timeout=None): 908 """Construct an ssh-based rpc call, and execute it. 909 910 @param call: a url string that calls a method to a devserver. 911 @param readline: whether read http response line by line. 912 @param timeout: The timeout seconds for ssh call. 913 914 @return the results of this call. 915 """ 916 hostname = get_hostname(call) 917 ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call)) 918 timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60 919 try: 920 result = utils.run(ssh_call, timeout=timeout_seconds) 921 except error.CmdError as e: 922 logging.debug('Error occurred with exit_code %d when executing the ' 923 'ssh call: %s.', e.result_obj.exit_status, 924 e.result_obj.stderr) 925 c = metrics.Counter('chromeos/autotest/devserver/ssh_failure') 926 c.increment(fields={'dev_server': hostname}) 927 raise 928 response = result.stdout 929 930 # If the curl command's returned HTTP response contains certain 931 # exception string, raise the DevServerException of the response. 932 if 'DownloaderException' in response: 933 raise DevServerException(_strip_http_message(response)) 934 935 if readline: 936 # Remove line terminators and trailing whitespace 937 response = response.splitlines() 938 return [line.rstrip() for line in response] 939 940 return response 941 942 943 @classmethod 944 def run_call(cls, call, readline=False, timeout=None): 945 """Invoke a given devserver call using urllib.open or ssh. 946 947 Open the URL with HTTP or SSH-based HTTP, and return the text of the 948 response. Exceptions may be raised as for urllib2.urlopen() or 949 utils.run(). 950 951 @param call: a url string that calls a method to a devserver. 952 @param readline: whether read http response line by line. 953 @param timeout: The timeout seconds for urlopen call or ssh call. 954 955 @return the results of this call. 956 """ 957 server_name = get_hostname(call) 958 is_in_restricted_subnet = utils.get_restricted_subnet( 959 server_name, utils.RESTRICTED_SUBNETS) 960 if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or 961 not is_in_restricted_subnet): 962 return super(ImageServerBase, cls).run_call( 963 call, readline=readline, timeout=timeout) 964 else: 965 return cls.run_ssh_call( 966 call, readline=readline, timeout=timeout) 967 968 969 @classmethod 970 def download_file(cls, remote_file, local_file, timeout=None): 971 """Download file from devserver. 972 973 The format of remote_file should be: 974 http://devserver_ip:8082/static/board/... 975 976 @param remote_file: The URL of the file on devserver that need to be 977 downloaded. 978 @param local_file: The path of the file saved to local. 979 @param timeout: The timeout seconds for this call. 980 """ 981 response = cls.run_call(remote_file, timeout=timeout) 982 with open(local_file, 'w') as out_log: 983 out_log.write(response) 984 985 986 def _poll_is_staged(self, **kwargs): 987 """Polling devserver.is_staged until all artifacts are staged. 988 989 @param kwargs: keyword arguments to make is_staged devserver call. 990 991 @return: True if all artifacts are staged in devserver. 992 """ 993 call = self.build_call('is_staged', **kwargs) 994 995 def all_staged(): 996 """Call devserver.is_staged rpc to check if all files are staged. 997 998 @return: True if all artifacts are staged in devserver. False 999 otherwise. 1000 @rasies DevServerException, the exception is a wrapper of all 1001 exceptions that were raised when devserver tried to download 1002 the artifacts. devserver raises an HTTPError or a CmdError 1003 when an exception was raised in the code. Such exception 1004 should be re-raised here to stop the caller from waiting. 1005 If the call to devserver failed for connection issue, a 1006 URLError exception is raised, and caller should retry the 1007 call to avoid such network flakiness. 1008 1009 """ 1010 try: 1011 result = self.run_call(call) 1012 logging.debug('whether artifact is staged: %r', result) 1013 return result == 'True' 1014 except urllib2.HTTPError as e: 1015 error_markup = e.read() 1016 raise DevServerException(_strip_http_message(error_markup)) 1017 except urllib2.URLError as e: 1018 # Could be connection issue, retry it. 1019 # For example: <urlopen error [Errno 111] Connection refused> 1020 logging.error('URLError happens in is_stage: %r', e) 1021 return False 1022 except error.CmdError as e: 1023 # Retry if SSH failed to connect to the devserver. 1024 logging.warning('CmdError happens in is_stage: %r, will retry', e) 1025 return False 1026 1027 bin_utils.poll_for_condition( 1028 all_staged, 1029 exception=bin_utils.TimeoutError(), 1030 timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60, 1031 sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL) 1032 1033 return True 1034 1035 1036 def _call_and_wait(self, call_name, error_message, 1037 expected_response=SUCCESS, **kwargs): 1038 """Helper method to make a urlopen call, and wait for artifacts staged. 1039 1040 @param call_name: name of devserver rpc call. 1041 @param error_message: Error message to be thrown if response does not 1042 match expected_response. 1043 @param expected_response: Expected response from rpc, default to 1044 |Success|. If it's set to None, do not compare 1045 the actual response. Any response is consider 1046 to be good. 1047 @param kwargs: keyword arguments to make is_staged devserver call. 1048 1049 @return: The response from rpc. 1050 @raise DevServerException upon any return code that's expected_response. 1051 1052 """ 1053 call = self.build_call(call_name, async=True, **kwargs) 1054 try: 1055 response = self.run_call(call) 1056 logging.debug('response for RPC: %r', response) 1057 if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response: 1058 logging.debug('Proxy error happens in RPC call, ' 1059 'will retry in 30 seconds') 1060 time.sleep(30) 1061 raise DevServerOverloadException() 1062 except httplib.BadStatusLine as e: 1063 logging.error(e) 1064 raise DevServerException('Received Bad Status line, Devserver %s ' 1065 'might have gone down while handling ' 1066 'the call: %s' % (self.url(), call)) 1067 1068 if expected_response and not response == expected_response: 1069 raise DevServerException(error_message) 1070 1071 # `os_type` is needed in build a devserver call, but not needed for 1072 # wait_for_artifacts_staged, since that method is implemented by 1073 # each ImageServerBase child class. 1074 if 'os_type' in kwargs: 1075 del kwargs['os_type'] 1076 self.wait_for_artifacts_staged(**kwargs) 1077 return response 1078 1079 1080 def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs): 1081 """Tell the devserver to download and stage |artifacts| from |image| 1082 specified by kwargs. 1083 1084 This is the main call point for staging any specific artifacts for a 1085 given build. To see the list of artifacts one can stage see: 1086 1087 ~src/platfrom/dev/artifact_info.py. 1088 1089 This is maintained along with the actual devserver code. 1090 1091 @param artifacts: A list of artifacts. 1092 @param files: A list of files to stage. 1093 @param archive_url: Optional parameter that has the archive_url to stage 1094 this artifact from. Default is specified in autotest config + 1095 image. 1096 @param kwargs: keyword arguments that specify the build information, to 1097 make stage devserver call. 1098 1099 @raise DevServerException upon any return code that's not HTTP OK. 1100 """ 1101 if not archive_url: 1102 archive_url = _get_storage_server_for_artifacts(artifacts) + build 1103 1104 artifacts_arg = ','.join(artifacts) if artifacts else '' 1105 files_arg = ','.join(files) if files else '' 1106 error_message = ("staging %s for %s failed;" 1107 "HTTP OK not accompanied by 'Success'." % 1108 ('artifacts=%s files=%s ' % (artifacts_arg, files_arg), 1109 build)) 1110 1111 staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' % 1112 (build, artifacts, files, archive_url)) 1113 logging.info('Staging artifacts on devserver %s: %s', 1114 self.url(), staging_info) 1115 success = False 1116 try: 1117 arguments = {'archive_url': archive_url, 1118 'artifacts': artifacts_arg, 1119 'files': files_arg} 1120 if kwargs: 1121 arguments.update(kwargs) 1122 # TODO(akeshet): canonicalize artifacts_arg before using it as a 1123 # metric field (as it stands it is a not-very-well-controlled 1124 # string). 1125 f = {'artifacts': artifacts_arg, 1126 'dev_server': self.resolved_hostname} 1127 with metrics.SecondsTimer( 1128 'chromeos/autotest/devserver/stage_artifact_duration', 1129 fields=f): 1130 self.call_and_wait(call_name='stage', error_message=error_message, 1131 **arguments) 1132 logging.info('Finished staging artifacts: %s', staging_info) 1133 success = True 1134 except (bin_utils.TimeoutError, error.TimeoutException): 1135 logging.error('stage_artifacts timed out: %s', staging_info) 1136 raise DevServerException( 1137 'stage_artifacts timed out: %s' % staging_info) 1138 finally: 1139 f = {'success': success, 1140 'artifacts': artifacts_arg, 1141 'dev_server': self.resolved_hostname} 1142 metrics.Counter('chromeos/autotest/devserver/stage_artifact' 1143 ).increment(fields=f) 1144 1145 1146 def call_and_wait(self, *args, **kwargs): 1147 """Helper method to make a urlopen call, and wait for artifacts staged. 1148 1149 This method needs to be overridden in the subclass to implement the 1150 logic to call _call_and_wait. 1151 """ 1152 raise NotImplementedError 1153 1154 1155 def _trigger_download(self, build, artifacts, files, synchronous=True, 1156 **kwargs_build_info): 1157 """Tell the devserver to download and stage image specified in 1158 kwargs_build_info. 1159 1160 Tells the devserver to fetch |image| from the image storage server 1161 named by _get_image_storage_server(). 1162 1163 If |synchronous| is True, waits for the entire download to finish 1164 staging before returning. Otherwise only the artifacts necessary 1165 to start installing images onto DUT's will be staged before returning. 1166 A caller can then call finish_download to guarantee the rest of the 1167 artifacts have finished staging. 1168 1169 @param synchronous: if True, waits until all components of the image are 1170 staged before returning. 1171 @param kwargs_build_info: Dictionary of build information. 1172 For CrOS, it is None as build is the CrOS image name. 1173 For Android, it is {'target': target, 1174 'build_id': build_id, 1175 'branch': branch} 1176 1177 @raise DevServerException upon any return code that's not HTTP OK. 1178 1179 """ 1180 if kwargs_build_info: 1181 archive_url = None 1182 else: 1183 archive_url = _get_image_storage_server() + build 1184 error_message = ("trigger_download for %s failed;" 1185 "HTTP OK not accompanied by 'Success'." % build) 1186 kwargs = {'archive_url': archive_url, 1187 'artifacts': artifacts, 1188 'files': files, 1189 'error_message': error_message} 1190 if kwargs_build_info: 1191 kwargs.update(kwargs_build_info) 1192 1193 logging.info('trigger_download starts for %s', build) 1194 try: 1195 response = self.call_and_wait(call_name='stage', **kwargs) 1196 logging.info('trigger_download finishes for %s', build) 1197 except (bin_utils.TimeoutError, error.TimeoutException): 1198 logging.error('trigger_download timed out for %s.', build) 1199 raise DevServerException( 1200 'trigger_download timed out for %s.' % build) 1201 was_successful = response == SUCCESS 1202 if was_successful and synchronous: 1203 self._finish_download(build, artifacts, files, **kwargs_build_info) 1204 1205 1206 def _finish_download(self, build, artifacts, files, **kwargs_build_info): 1207 """Tell the devserver to finish staging image specified in 1208 kwargs_build_info. 1209 1210 If trigger_download is called with synchronous=False, it will return 1211 before all artifacts have been staged. This method contacts the 1212 devserver and blocks until all staging is completed and should be 1213 called after a call to trigger_download. 1214 1215 @param kwargs_build_info: Dictionary of build information. 1216 For CrOS, it is None as build is the CrOS image name. 1217 For Android, it is {'target': target, 1218 'build_id': build_id, 1219 'branch': branch} 1220 1221 @raise DevServerException upon any return code that's not HTTP OK. 1222 """ 1223 archive_url = _get_image_storage_server() + build 1224 error_message = ("finish_download for %s failed;" 1225 "HTTP OK not accompanied by 'Success'." % build) 1226 kwargs = {'archive_url': archive_url, 1227 'artifacts': artifacts, 1228 'files': files, 1229 'error_message': error_message} 1230 if kwargs_build_info: 1231 kwargs.update(kwargs_build_info) 1232 try: 1233 self.call_and_wait(call_name='stage', **kwargs) 1234 except (bin_utils.TimeoutError, error.TimeoutException): 1235 logging.error('finish_download timed out for %s', build) 1236 raise DevServerException( 1237 'finish_download timed out for %s.' % build) 1238 1239 1240 @remote_devserver_call() 1241 def locate_file(self, file_name, artifacts, build, build_info): 1242 """Locate a file with the given file_name on devserver. 1243 1244 This method calls devserver RPC `locate_file` to look up a file with 1245 the given file name inside specified build artifacts. 1246 1247 @param file_name: Name of the file to look for a file. 1248 @param artifacts: A list of artifact names to search for the file. 1249 @param build: Name of the build. For Android, it's None as build_info 1250 should be used. 1251 @param build_info: Dictionary of build information. 1252 For CrOS, it is None as build is the CrOS image name. 1253 For Android, it is {'target': target, 1254 'build_id': build_id, 1255 'branch': branch} 1256 1257 @return: A devserver url to the file. 1258 @raise DevServerException upon any return code that's not HTTP OK. 1259 """ 1260 if not build and not build_info: 1261 raise DevServerException('You must specify build information to ' 1262 'look for file %s in artifacts %s.' % 1263 (file_name, artifacts)) 1264 kwargs = {'file_name': file_name, 1265 'artifacts': artifacts} 1266 if build_info: 1267 build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info 1268 kwargs.update(build_info) 1269 # Devserver treats Android and Brillo build in the same way as they 1270 # are both retrieved from Launch Control and have similar build 1271 # artifacts. Therefore, os_type for devserver calls is `android` for 1272 # both Android and Brillo builds. 1273 kwargs['os_type'] = 'android' 1274 else: 1275 build_path = build 1276 kwargs['build'] = build 1277 call = self.build_call('locate_file', async=False, **kwargs) 1278 try: 1279 file_path = self.run_call(call) 1280 return os.path.join(self.url(), 'static', build_path, file_path) 1281 except httplib.BadStatusLine as e: 1282 logging.error(e) 1283 raise DevServerException('Received Bad Status line, Devserver %s ' 1284 'might have gone down while handling ' 1285 'the call: %s' % (self.url(), call)) 1286 1287 1288 @remote_devserver_call() 1289 def list_control_files(self, build, suite_name=''): 1290 """Ask the devserver to list all control files for |build|. 1291 1292 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 1293 whose control files the caller wants listed. 1294 @param suite_name: The name of the suite for which we require control 1295 files. 1296 @return None on failure, or a list of control file paths 1297 (e.g. server/site_tests/autoupdate/control) 1298 @raise DevServerException upon any return code that's not HTTP OK. 1299 """ 1300 build = self.translate(build) 1301 call = self.build_call('controlfiles', build=build, 1302 suite_name=suite_name) 1303 return self.run_call(call, readline=True) 1304 1305 1306 @remote_devserver_call() 1307 def get_control_file(self, build, control_path): 1308 """Ask the devserver for the contents of a control file. 1309 1310 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 1311 whose control file the caller wants to fetch. 1312 @param control_path: The file to fetch 1313 (e.g. server/site_tests/autoupdate/control) 1314 @return The contents of the desired file. 1315 @raise DevServerException upon any return code that's not HTTP OK. 1316 """ 1317 build = self.translate(build) 1318 call = self.build_call('controlfiles', build=build, 1319 control_path=control_path) 1320 return self.run_call(call) 1321 1322 1323 @remote_devserver_call() 1324 def list_suite_controls(self, build, suite_name=''): 1325 """Ask the devserver to list contents of all control files for |build|. 1326 1327 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 1328 whose control files' contents the caller wants returned. 1329 @param suite_name: The name of the suite for which we require control 1330 files. 1331 @return None on failure, or a dict of contents of all control files 1332 (e.g. {'path1': "#Copyright controls ***", ..., 1333 pathX': "#Copyright controls ***"} 1334 @raise DevServerException upon any return code that's not HTTP OK. 1335 """ 1336 build = self.translate(build) 1337 call = self.build_call('list_suite_controls', build=build, 1338 suite_name=suite_name) 1339 return json.load(cStringIO.StringIO(self.run_call(call))) 1340 1341 1342class ImageServer(ImageServerBase): 1343 """Class for DevServer that handles RPCs related to CrOS images. 1344 1345 The calls to devserver to stage artifacts, including stage and download, are 1346 made in async mode. That is, when caller makes an RPC |stage| to request 1347 devserver to stage certain artifacts, devserver handles the call and starts 1348 staging artifacts in a new thread, and return |Success| without waiting for 1349 staging being completed. When caller receives message |Success|, it polls 1350 devserver's is_staged call until all artifacts are staged. 1351 Such mechanism is designed to prevent cherrypy threads in devserver being 1352 running out, as staging artifacts might take long time, and cherrypy starts 1353 with a fixed number of threads that handle devserver rpc. 1354 """ 1355 1356 class ArtifactUrls(object): 1357 """A container for URLs of staged artifacts. 1358 1359 Attributes: 1360 full_payload: URL for downloading a staged full release update 1361 mton_payload: URL for downloading a staged M-to-N release update 1362 nton_payload: URL for downloading a staged N-to-N release update 1363 1364 """ 1365 def __init__(self, full_payload=None, mton_payload=None, 1366 nton_payload=None): 1367 self.full_payload = full_payload 1368 self.mton_payload = mton_payload 1369 self.nton_payload = nton_payload 1370 1371 1372 def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''): 1373 """Polling devserver.is_staged until all artifacts are staged. 1374 1375 @param archive_url: Google Storage URL for the build. 1376 @param artifacts: Comma separated list of artifacts to download. 1377 @param files: Comma separated list of files to download. 1378 @return: True if all artifacts are staged in devserver. 1379 """ 1380 kwargs = {'archive_url': archive_url, 1381 'artifacts': artifacts, 1382 'files': files} 1383 return self._poll_is_staged(**kwargs) 1384 1385 1386 @remote_devserver_call() 1387 def call_and_wait(self, call_name, archive_url, artifacts, files, 1388 error_message, expected_response=SUCCESS): 1389 """Helper method to make a urlopen call, and wait for artifacts staged. 1390 1391 @param call_name: name of devserver rpc call. 1392 @param archive_url: Google Storage URL for the build.. 1393 @param artifacts: Comma separated list of artifacts to download. 1394 @param files: Comma separated list of files to download. 1395 @param expected_response: Expected response from rpc, default to 1396 |Success|. If it's set to None, do not compare 1397 the actual response. Any response is consider 1398 to be good. 1399 @param error_message: Error message to be thrown if response does not 1400 match expected_response. 1401 1402 @return: The response from rpc. 1403 @raise DevServerException upon any return code that's expected_response. 1404 1405 """ 1406 kwargs = {'archive_url': archive_url, 1407 'artifacts': artifacts, 1408 'files': files} 1409 return self._call_and_wait(call_name, error_message, 1410 expected_response, **kwargs) 1411 1412 1413 @remote_devserver_call() 1414 def stage_artifacts(self, image=None, artifacts=None, files='', 1415 archive_url=None): 1416 """Tell the devserver to download and stage |artifacts| from |image|. 1417 1418 This is the main call point for staging any specific artifacts for a 1419 given build. To see the list of artifacts one can stage see: 1420 1421 ~src/platfrom/dev/artifact_info.py. 1422 1423 This is maintained along with the actual devserver code. 1424 1425 @param image: the image to fetch and stage. 1426 @param artifacts: A list of artifacts. 1427 @param files: A list of files to stage. 1428 @param archive_url: Optional parameter that has the archive_url to stage 1429 this artifact from. Default is specified in autotest config + 1430 image. 1431 1432 @raise DevServerException upon any return code that's not HTTP OK. 1433 """ 1434 if not artifacts and not files: 1435 raise DevServerException('Must specify something to stage.') 1436 image = self.translate(image) 1437 self._stage_artifacts(image, artifacts, files, archive_url) 1438 1439 1440 @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS) 1441 def list_image_dir(self, image): 1442 """List the contents of the image stage directory, on the devserver. 1443 1444 @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>. 1445 1446 @raise DevServerException upon any return code that's not HTTP OK. 1447 """ 1448 image = self.translate(image) 1449 logging.info('Requesting contents from devserver %s for image %s', 1450 self.url(), image) 1451 archive_url = _get_storage_server_for_artifacts() + image 1452 call = self.build_call('list_image_dir', archive_url=archive_url) 1453 response = self.run_call(call, readline=True) 1454 for line in response: 1455 logging.info(line) 1456 1457 1458 def trigger_download(self, image, synchronous=True): 1459 """Tell the devserver to download and stage |image|. 1460 1461 Tells the devserver to fetch |image| from the image storage server 1462 named by _get_image_storage_server(). 1463 1464 If |synchronous| is True, waits for the entire download to finish 1465 staging before returning. Otherwise only the artifacts necessary 1466 to start installing images onto DUT's will be staged before returning. 1467 A caller can then call finish_download to guarantee the rest of the 1468 artifacts have finished staging. 1469 1470 @param image: the image to fetch and stage. 1471 @param synchronous: if True, waits until all components of the image are 1472 staged before returning. 1473 1474 @raise DevServerException upon any return code that's not HTTP OK. 1475 1476 """ 1477 image = self.translate(image) 1478 artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE 1479 self._trigger_download(image, artifacts, files='', 1480 synchronous=synchronous) 1481 1482 1483 @remote_devserver_call() 1484 def setup_telemetry(self, build): 1485 """Tell the devserver to setup telemetry for this build. 1486 1487 The devserver will stage autotest and then extract the required files 1488 for telemetry. 1489 1490 @param build: the build to setup telemetry for. 1491 1492 @returns path on the devserver that telemetry is installed to. 1493 """ 1494 build = self.translate(build) 1495 archive_url = _get_image_storage_server() + build 1496 call = self.build_call('setup_telemetry', archive_url=archive_url) 1497 try: 1498 response = self.run_call(call) 1499 except httplib.BadStatusLine as e: 1500 logging.error(e) 1501 raise DevServerException('Received Bad Status line, Devserver %s ' 1502 'might have gone down while handling ' 1503 'the call: %s' % (self.url(), call)) 1504 return response 1505 1506 1507 def finish_download(self, image): 1508 """Tell the devserver to finish staging |image|. 1509 1510 If trigger_download is called with synchronous=False, it will return 1511 before all artifacts have been staged. This method contacts the 1512 devserver and blocks until all staging is completed and should be 1513 called after a call to trigger_download. 1514 1515 @param image: the image to fetch and stage. 1516 @raise DevServerException upon any return code that's not HTTP OK. 1517 """ 1518 image = self.translate(image) 1519 artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST 1520 self._finish_download(image, artifacts, files='') 1521 1522 1523 def get_update_url(self, image): 1524 """Returns the url that should be passed to the updater. 1525 1526 @param image: the image that was fetched. 1527 """ 1528 image = self.translate(image) 1529 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 1530 type=str) 1531 return (url_pattern % (self.url(), image)) 1532 1533 1534 def get_staged_file_url(self, filename, image): 1535 """Returns the url of a staged file for this image on the devserver.""" 1536 return '/'.join([self._get_image_url(image), filename]) 1537 1538 1539 def get_full_payload_url(self, image): 1540 """Returns a URL to a staged full payload. 1541 1542 @param image: the image that was fetched. 1543 1544 @return A fully qualified URL that can be used for downloading the 1545 payload. 1546 1547 """ 1548 return self._get_image_url(image) + '/update.gz' 1549 1550 1551 def get_test_image_url(self, image): 1552 """Returns a URL to a staged test image. 1553 1554 @param image: the image that was fetched. 1555 1556 @return A fully qualified URL that can be used for downloading the 1557 image. 1558 1559 """ 1560 return self._get_image_url(image) + '/chromiumos_test_image.bin' 1561 1562 1563 @remote_devserver_call() 1564 def get_dependencies_file(self, build): 1565 """Ask the dev server for the contents of the suite dependencies file. 1566 1567 Ask the dev server at |self._dev_server| for the contents of the 1568 pre-processed suite dependencies file (at DEPENDENCIES_FILE) 1569 for |build|. 1570 1571 @param build: The build (e.g. x86-mario-release/R21-2333.0.0) 1572 whose dependencies the caller is interested in. 1573 @return The contents of the dependencies file, which should eval to 1574 a dict of dicts, as per bin_utils/suite_preprocessor.py. 1575 @raise DevServerException upon any return code that's not HTTP OK. 1576 """ 1577 build = self.translate(build) 1578 call = self.build_call('controlfiles', 1579 build=build, control_path=DEPENDENCIES_FILE) 1580 return self.run_call(call) 1581 1582 1583 @remote_devserver_call() 1584 def get_latest_build_in_gs(self, board): 1585 """Ask the devservers for the latest offical build in Google Storage. 1586 1587 @param board: The board for who we want the latest official build. 1588 @return A string of the returned build rambi-release/R37-5868.0.0 1589 @raise DevServerException upon any return code that's not HTTP OK. 1590 """ 1591 call = self.build_call( 1592 'xbuddy_translate/remote/%s/latest-official' % board, 1593 image_dir=_get_image_storage_server()) 1594 image_name = self.run_call(call) 1595 return os.path.dirname(image_name) 1596 1597 1598 def translate(self, build_name): 1599 """Translate the build name if it's in LATEST format. 1600 1601 If the build name is in the format [builder]/LATEST, return the latest 1602 build in Google Storage otherwise return the build name as is. 1603 1604 @param build_name: build_name to check. 1605 1606 @return The actual build name to use. 1607 """ 1608 match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I) 1609 if not match: 1610 return build_name 1611 translated_build = self.get_latest_build_in_gs(match.groups()[0]) 1612 logging.debug('Translated relative build %s to %s', build_name, 1613 translated_build) 1614 return translated_build 1615 1616 1617 @classmethod 1618 @remote_devserver_call() 1619 def get_latest_build(cls, target, milestone=''): 1620 """Ask all the devservers for the latest build for a given target. 1621 1622 @param target: The build target, typically a combination of the board 1623 and the type of build e.g. x86-mario-release. 1624 @param milestone: For latest build set to '', for builds only in a 1625 specific milestone set to a str of format Rxx 1626 (e.g. R16). Default: ''. Since we are dealing with a 1627 webserver sending an empty string, '', ensures that 1628 the variable in the URL is ignored as if it was set 1629 to None. 1630 @return A string of the returned build e.g. R20-2226.0.0. 1631 @raise DevServerException upon any return code that's not HTTP OK. 1632 """ 1633 calls = cls.build_all_calls('latestbuild', target=target, 1634 milestone=milestone) 1635 latest_builds = [] 1636 for call in calls: 1637 latest_builds.append(cls.run_call(call)) 1638 1639 return max(latest_builds, key=version.LooseVersion) 1640 1641 1642 @remote_devserver_call() 1643 def _kill_au_process_for_host(self, **kwargs): 1644 """Kill the triggerred auto_update process if error happens in cros_au. 1645 1646 @param kwargs: Arguments to make kill_au_proc devserver call. 1647 """ 1648 call = self.build_call('kill_au_proc', **kwargs) 1649 response = self.run_call(call) 1650 if not response == 'True': 1651 raise DevServerException( 1652 'Failed to kill the triggerred CrOS auto_update process' 1653 'on devserver %s, the response is %s' % ( 1654 self.url(), response)) 1655 1656 1657 def kill_au_process_for_host(self, host_name, pid): 1658 """Kill the triggerred auto_update process if error happens. 1659 1660 Usually this function is used to clear all potential left au processes 1661 of the given host name. 1662 1663 If pid is specified, the devserver will further check the given pid to 1664 make sure the process is killed. This is used for the case that the au 1665 process has started in background, but then provision fails due to 1666 some unknown issues very fast. In this case, when 'kill_au_proc' is 1667 called, there's no corresponding background track log created for this 1668 ongoing au process, which prevents this RPC call from killing this au 1669 process. 1670 1671 @param host_name: The DUT's hostname. 1672 @param pid: The ongoing au process's pid. 1673 1674 @return: True if successfully kill the auto-update process for host. 1675 """ 1676 kwargs = {'host_name': host_name, 'pid': pid} 1677 try: 1678 self._kill_au_process_for_host(**kwargs) 1679 except DevServerException: 1680 return False 1681 1682 return True 1683 1684 1685 @remote_devserver_call() 1686 def _clean_track_log(self, **kwargs): 1687 """Clean track log for the current auto-update process.""" 1688 call = self.build_call('handler_cleanup', **kwargs) 1689 self.run_call(call) 1690 1691 1692 def clean_track_log(self, host_name, pid): 1693 """Clean track log for the current auto-update process. 1694 1695 @param host_name: The host name to be updated. 1696 @param pid: The auto-update process id. 1697 1698 @return: True if track log is successfully cleaned, False otherwise. 1699 """ 1700 if not pid: 1701 return False 1702 1703 kwargs = {'host_name': host_name, 'pid': pid} 1704 try: 1705 self._clean_track_log(**kwargs) 1706 except DevServerException as e: 1707 logging.debug('Failed to clean track_status_file on ' 1708 'devserver for host %s and process id %s: %s', 1709 host_name, pid, str(e)) 1710 return False 1711 1712 return True 1713 1714 1715 def _get_au_log_filename(self, log_dir, host_name, pid): 1716 """Return the auto-update log's filename.""" 1717 return os.path.join(log_dir, CROS_AU_LOG_FILENAME % ( 1718 host_name, pid)) 1719 1720 def _read_json_response_from_devserver(self, response): 1721 """Reads the json response from the devserver. 1722 1723 This is extracted to its own function so that it can be easily mocked. 1724 @param response: the response for a devserver. 1725 """ 1726 try: 1727 return json.loads(response) 1728 except ValueError as e: 1729 raise DevServerException(e) 1730 1731 1732 @remote_devserver_call() 1733 def _collect_au_log(self, log_dir, **kwargs): 1734 """Collect logs from devserver after cros-update process is finished. 1735 1736 Collect the logs that recording the whole cros-update process, and 1737 write it to sysinfo path of a job. 1738 1739 The example log file name that is stored is like: 1740 '1220-repair/sysinfo/CrOS_update_host_name_pid.log' 1741 1742 @param host_name: the DUT's hostname. 1743 @param pid: the auto-update process id on devserver. 1744 @param log_dir: The directory to save the cros-update process log 1745 retrieved from devserver. 1746 """ 1747 call = self.build_call('collect_cros_au_log', **kwargs) 1748 response = self.run_call(call) 1749 if not os.path.exists(log_dir): 1750 os.mkdir(log_dir) 1751 write_file = self._get_au_log_filename( 1752 log_dir, kwargs['host_name'], kwargs['pid']) 1753 logging.debug('Saving auto-update logs into %s', write_file) 1754 1755 au_logs = self._read_json_response_from_devserver(response) 1756 1757 try: 1758 for k, v in au_logs['host_logs'].items(): 1759 log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid']) 1760 log_path = os.path.join(log_dir, log_name) 1761 with open(log_path, 'w') as out_log: 1762 out_log.write(v) 1763 except IOError as e: 1764 raise DevServerException('Failed to write auto-update hostlogs: ' 1765 '%s' % e) 1766 1767 try: 1768 with open(write_file, 'w') as out_log: 1769 out_log.write(au_logs['cros_au_log']) 1770 except: 1771 raise DevServerException('Failed to write auto-update logs into ' 1772 '%s' % write_file) 1773 1774 1775 def collect_au_log(self, host_name, pid, log_dir): 1776 """Collect logs from devserver after cros-update process is finished. 1777 1778 @param host_name: the DUT's hostname. 1779 @param pid: the auto-update process id on devserver. 1780 @param log_dir: The directory to save the cros-update process log 1781 retrieved from devserver. 1782 1783 @return: True if auto-update log is successfully collected, False 1784 otherwise. 1785 """ 1786 if not pid: 1787 return False 1788 1789 kwargs = {'host_name': host_name, 'pid': pid} 1790 try: 1791 self._collect_au_log(log_dir, **kwargs) 1792 except DevServerException as e: 1793 logging.debug('Failed to collect auto-update log on ' 1794 'devserver for host %s and process id %s: %s', 1795 host_name, pid, str(e)) 1796 return False 1797 1798 return True 1799 1800 1801 @remote_devserver_call() 1802 def _trigger_auto_update(self, **kwargs): 1803 """Trigger auto-update by calling devserver.cros_au. 1804 1805 @param kwargs: Arguments to make cros_au devserver call. 1806 1807 @return: a tuple indicates whether the RPC call cros_au succeeds and 1808 the auto-update process id running on devserver. 1809 """ 1810 host_name = kwargs['host_name'] 1811 call = self.build_call('cros_au', async=True, **kwargs) 1812 try: 1813 response = self.run_call(call) 1814 logging.info( 1815 'Received response from devserver for cros_au call: %r', 1816 response) 1817 except httplib.BadStatusLine as e: 1818 logging.error(e) 1819 raise DevServerException('Received Bad Status line, Devserver %s ' 1820 'might have gone down while handling ' 1821 'the call: %s' % (self.url(), call)) 1822 1823 return response 1824 1825 1826 def _wait_for_auto_update_finished(self, pid, **kwargs): 1827 """Polling devserver.get_au_status to get current auto-update status. 1828 1829 The current auto-update status is used to identify whether the update 1830 process is finished. 1831 1832 @param pid: The background process id for auto-update in devserver. 1833 @param kwargs: keyword arguments to make get_au_status devserver call. 1834 1835 @return: True if auto-update is finished for a given dut. 1836 """ 1837 logging.debug('Check the progress for auto-update process %r', pid) 1838 kwargs['pid'] = pid 1839 call = self.build_call('get_au_status', **kwargs) 1840 1841 def all_finished(): 1842 """Call devserver.get_au_status rpc to check if auto-update 1843 is finished. 1844 1845 @return: True if auto-update is finished for a given dut. False 1846 otherwise. 1847 @rasies DevServerException, the exception is a wrapper of all 1848 exceptions that were raised when devserver tried to 1849 download the artifacts. devserver raises an HTTPError or 1850 a CmdError when an exception was raised in the code. Such 1851 exception should be re-raised here to stop the caller from 1852 waiting. If the call to devserver failed for connection 1853 issue, a URLError exception is raised, and caller should 1854 retry the call to avoid such network flakiness. 1855 1856 """ 1857 try: 1858 au_status = self.run_call(call) 1859 response = json.loads(au_status) 1860 # This is a temp fix to fit both dict and tuple returning 1861 # values. The dict check will be removed after a corresponding 1862 # devserver CL is deployed. 1863 if isinstance(response, dict): 1864 if response.get('detailed_error_msg'): 1865 raise DevServerException( 1866 response.get('detailed_error_msg')) 1867 1868 if response.get('finished'): 1869 logging.debug('CrOS auto-update is finished') 1870 return True 1871 else: 1872 logging.debug('Current CrOS auto-update status: %s', 1873 response.get('status')) 1874 return False 1875 1876 if not response[0]: 1877 logging.debug('Current CrOS auto-update status: %s', 1878 response[1]) 1879 return False 1880 else: 1881 logging.debug('CrOS auto-update is finished') 1882 return True 1883 except urllib2.HTTPError as e: 1884 error_markup = e.read() 1885 raise DevServerException(_strip_http_message(error_markup)) 1886 except urllib2.URLError as e: 1887 # Could be connection issue, retry it. 1888 # For example: <urlopen error [Errno 111] Connection refused> 1889 logging.warning('URLError (%r): Retrying connection to ' 1890 'devserver to check auto-update status.', e) 1891 return False 1892 except error.CmdError: 1893 # Retry if SSH failed to connect to the devserver. 1894 logging.warning('CmdError: Retrying SSH connection to check ' 1895 'auto-update status.') 1896 return False 1897 except socket.error as e: 1898 # Could be some temporary devserver connection issues. 1899 logging.warning('Socket Error (%r): Retrying connection to ' 1900 'devserver to check auto-update status.', e) 1901 return False 1902 except ValueError as e: 1903 raise DevServerException( 1904 '%s (Got AU status: %r)' % (str(e), au_status)) 1905 1906 bin_utils.poll_for_condition( 1907 all_finished, 1908 exception=bin_utils.TimeoutError(), 1909 timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60, 1910 sleep_interval=CROS_AU_POLLING_INTERVAL) 1911 1912 return True 1913 1914 1915 def wait_for_auto_update_finished(self, response, **kwargs): 1916 """Processing response of 'cros_au' and polling for auto-update status. 1917 1918 Will wait for the whole auto-update process is finished. 1919 1920 @param response: The response from RPC 'cros_au' 1921 @param kwargs: keyword arguments to make get_au_status devserver call. 1922 1923 @return: a tuple includes two elements. 1924 raised_error: None if everything works well or the raised error. 1925 pid: the auto-update process id on devserver. 1926 """ 1927 1928 pid = 0 1929 raised_error = None 1930 try: 1931 response = json.loads(response) 1932 if response[0]: 1933 pid = response[1] 1934 logging.debug('start process %r for auto_update in devserver', 1935 pid) 1936 self._wait_for_auto_update_finished(pid, **kwargs) 1937 except Exception as e: 1938 logging.debug('Failed to trigger auto-update process on devserver') 1939 raised_error = e 1940 finally: 1941 return raised_error, pid 1942 1943 1944 def _parse_AU_error(self, response): 1945 """Parse auto_update error returned from devserver.""" 1946 return re.split('\n', response)[-1] 1947 1948 1949 def _classify_exceptions(self, error_list): 1950 """Parse the error that was raised from auto_update. 1951 1952 @param error_list: The list of errors (string) happened in auto-update 1953 1954 @return: A classified exception type (string) from _EXCEPTION_PATTERNS 1955 or 'Unknown exception'. Current patterns in _EXCEPTION_PATTERNS are 1956 very specific so that errors cannot match more than one pattern. 1957 """ 1958 raised_error = '' 1959 if not error_list: 1960 return raised_error 1961 else: 1962 target_error = error_list[0] 1963 1964 for err_pattern, classification in _EXCEPTION_PATTERNS: 1965 match = re.match(err_pattern, target_error) 1966 if match: 1967 return classification 1968 1969 return '(0) Unknown exception' 1970 1971 1972 def _check_error_message(self, error_patterns_to_check, error_msg): 1973 """Detect whether specific error pattern exist in error message. 1974 1975 @param error_patterns_to_check: the error patterns to check 1976 @param error_msg: the error message which may include any error 1977 pattern. 1978 1979 @return A boolean variable, True if error_msg contains any error 1980 pattern in error_patterns_to_check, False otherwise. 1981 """ 1982 for err in error_patterns_to_check: 1983 if err in error_msg: 1984 return True 1985 1986 return False 1987 1988 1989 def _is_retryable(self, error_msg): 1990 """Detect whether we will retry auto-update based on error_msg. 1991 1992 @param error_msg: The given error message. 1993 1994 @return A boolean variable which indicates whether we will retry 1995 auto_update with another devserver based on the given error_msg. 1996 """ 1997 # For now we just hard-code the error message we think it's suspicious. 1998 # When we get more date about what's the json response when devserver 1999 # is overloaded, we can update this part. 2000 retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE, 2001 'is not pingable'] 2002 return self._check_error_message(retryable_error_patterns, error_msg) 2003 2004 2005 def _should_use_original_payload(self, error_msg): 2006 devserver_error_patterns = ['DevserverCannotStartError'] 2007 return self._check_error_message(devserver_error_patterns, error_msg) 2008 2009 2010 def _parse_buildname_safely(self, build_name): 2011 """Parse a given buildname safely. 2012 2013 @param build_name: the build name to be parsed. 2014 2015 @return: a tuple (board, build_type, milestone) 2016 """ 2017 try: 2018 board, build_type, milestone, _ = server_utils.ParseBuildName( 2019 build_name) 2020 except server_utils.ParseBuildNameException: 2021 logging.warning('Unable to parse build name %s for metrics. ' 2022 'Continuing anyway.', build_name) 2023 board, build_type, milestone = ('', '', '') 2024 2025 return board, build_type, milestone 2026 2027 2028 def auto_update(self, host_name, build_name, original_board=None, 2029 original_release_version=None, log_dir=None, 2030 force_update=False, full_update=False, 2031 payload_filename=None, force_original=False, 2032 clobber_stateful=True): 2033 """Auto-update a CrOS host. 2034 2035 @param host_name: The hostname of the DUT to auto-update. 2036 @param build_name: The build name to be auto-updated on the DUT. 2037 @param original_board: The original board of the DUT to auto-update. 2038 @param original_release_version: The release version of the DUT's 2039 current build. 2040 @param log_dir: The log directory to store auto-update logs from 2041 devserver. 2042 @param force_update: Force an update even if the version installed 2043 is the same. Default: False. 2044 @param full_update: If True, do not run stateful update, directly 2045 force a full reimage. If False, try stateful 2046 update first if the dut is already installed 2047 with the same version. 2048 @param payload_filename: Used to specify the exact file to 2049 use for autoupdating. If None, the payload 2050 will be determined by build_name. You 2051 must have already staged this file before 2052 passing it in here. 2053 @param force_original: Whether to force stateful update with the 2054 original payload. 2055 @param clobber_stateful: If True do a clean install of stateful. 2056 2057 @return A set (is_success, pid) in which: 2058 1. is_success indicates whether this auto_update succeeds. 2059 2. pid is the process id of the successful autoupdate run. 2060 2061 @raise DevServerException if auto_update fails and is not retryable. 2062 @raise RetryableProvisionException if it fails and is retryable. 2063 """ 2064 kwargs = {'host_name': host_name, 2065 'build_name': build_name, 2066 'force_update': force_update, 2067 'full_update': full_update, 2068 'clobber_stateful': clobber_stateful} 2069 2070 if payload_filename is not None: 2071 kwargs['payload_filename'] = payload_filename 2072 2073 error_msg = 'CrOS auto-update failed for host %s: %s' 2074 error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s' 2075 is_au_success = False 2076 au_log_dir = os.path.join(log_dir, 2077 AUTO_UPDATE_LOG_DIR) if log_dir else None 2078 error_list = [] 2079 retry_with_another_devserver = False 2080 board, build_type, milestone = self._parse_buildname_safely(build_name) 2081 2082 for au_attempt in range(AU_RETRY_LIMIT): 2083 logging.debug('Start CrOS auto-update for host %s at %d time(s).', 2084 host_name, au_attempt + 1) 2085 # No matter _trigger_auto_update succeeds or fails, the auto-update 2086 # track_status_file should be cleaned, and the auto-update execute 2087 # log should be collected to directory sysinfo. Also, the error 2088 # raised by _trigger_auto_update should be displayed. 2089 try: 2090 # Try update with stateful.tgz of old release version in the 2091 # last try of auto-update. 2092 if force_original and original_release_version: 2093 # Monitor this case in monarch 2094 original_build = '%s/%s' % (original_board, 2095 original_release_version) 2096 c = metrics.Counter( 2097 'chromeos/autotest/provision/' 2098 'cros_update_with_original_build') 2099 f = {'dev_server': self.resolved_hostname, 2100 'board': board, 2101 'build_type': build_type, 2102 'milestone': milestone, 2103 'original_build': original_build} 2104 c.increment(fields=f) 2105 2106 logging.debug('Try updating stateful partition of the ' 2107 'host with the same version of its current ' 2108 'rootfs partition: %s', original_build) 2109 response = self._trigger_auto_update( 2110 original_build=original_build, **kwargs) 2111 else: 2112 response = self._trigger_auto_update(**kwargs) 2113 except DevServerException as e: 2114 logging.debug(error_msg_attempt, au_attempt+1, str(e)) 2115 error_list.append(str(e)) 2116 else: 2117 raised_error, pid = self.wait_for_auto_update_finished(response, 2118 **kwargs) 2119 # Error happens in _collect_au_log won't be raised. Auto-update 2120 # process will be retried. 2121 if au_log_dir: 2122 is_collect_success = self.collect_au_log( 2123 kwargs['host_name'], pid, au_log_dir) 2124 else: 2125 is_collect_success = True 2126 2127 # Error happens in _clean_track_log won't be raised. Auto-update 2128 # process will be retried. 2129 # TODO(xixuan): Change kwargs['host_name'] back to host_name 2130 # if crbug.com/651974 is fixed: host_name represents the host 2131 # name of the host, and kwargs['host_name'] could be host_name 2132 # or the IP of this host. 2133 is_clean_success = self.clean_track_log(kwargs['host_name'], 2134 pid) 2135 # If any error is raised previously, log it and retry 2136 # auto-update. Otherwise, claim a successful CrOS auto-update. 2137 if not raised_error and is_clean_success and is_collect_success: 2138 logging.debug('CrOS auto-update succeed for host %s', 2139 host_name) 2140 is_au_success = True 2141 break 2142 else: 2143 if not self.kill_au_process_for_host(kwargs['host_name'], 2144 pid): 2145 logging.debug('Failed to kill auto_update process %d', 2146 pid) 2147 if raised_error: 2148 logging.debug(error_msg_attempt, au_attempt+1, 2149 str(raised_error)) 2150 if au_log_dir: 2151 logging.debug('Please see error details in log %s', 2152 self._get_au_log_filename( 2153 au_log_dir, 2154 kwargs['host_name'], 2155 pid)) 2156 error_list.append(self._parse_AU_error(str(raised_error))) 2157 if self._is_retryable(str(raised_error)): 2158 retry_with_another_devserver = True 2159 2160 if self._should_use_original_payload(str(raised_error)): 2161 force_original = True 2162 2163 finally: 2164 if retry_with_another_devserver: 2165 break 2166 2167 if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1: 2168 time.sleep(CROS_AU_RETRY_INTERVAL) 2169 # TODO(kevcheng): Remove this once crbug.com/651974 is 2170 # fixed. 2171 # DNS is broken in the cassandra lab, so use the IP of the 2172 # hostname instead if it fails. Not rename host_name here 2173 # for error msg reporting. 2174 host_name_ip = socket.gethostbyname(host_name) 2175 kwargs['host_name'] = host_name_ip 2176 logging.debug( 2177 'AU failed, trying IP instead of hostname: %s', 2178 host_name_ip) 2179 2180 # Note: To avoid reaching or exceeding the monarch field cardinality 2181 # limit, we avoid a metric that includes both dut hostname and other 2182 # high cardinality fields. 2183 # Per-devserver cros_update metric. 2184 c = metrics.Counter( 2185 'chromeos/autotest/provision/cros_update_by_devserver') 2186 # Add a field |error| here. Current error's pattern is manually 2187 # specified in _EXCEPTION_PATTERNS. 2188 raised_error = self._classify_exceptions(error_list) 2189 f = {'dev_server': self.resolved_hostname, 2190 'success': is_au_success, 2191 'board': board, 2192 'build_type': build_type, 2193 'milestone': milestone, 2194 'error': raised_error} 2195 c.increment(fields=f) 2196 2197 # Per-DUT cros_update metric. 2198 c = metrics.Counter('chromeos/autotest/provision/cros_update_per_dut') 2199 f = {'success': is_au_success, 2200 'board': board, 2201 'error': raised_error, 2202 'dut_host_name': host_name} 2203 c.increment(fields=f) 2204 2205 if is_au_success: 2206 return (is_au_success, pid) 2207 2208 # If errors happen in the CrOS AU process, report the first error 2209 # since the following errors might be caused by the first error. 2210 # If error happens in RPCs of cleaning track log, collecting 2211 # auto-update logs, or killing auto-update processes, just report 2212 # them together. 2213 if error_list: 2214 if retry_with_another_devserver: 2215 raise RetryableProvisionException( 2216 error_msg % (host_name, error_list[0])) 2217 else: 2218 raise DevServerException( 2219 error_msg % (host_name, error_list[0])) 2220 else: 2221 raise DevServerException(error_msg % ( 2222 host_name, ('RPC calls after the whole auto-update ' 2223 'process failed.'))) 2224 2225 2226class AndroidBuildServer(ImageServerBase): 2227 """Class for DevServer that handles RPCs related to Android builds. 2228 2229 The calls to devserver to stage artifacts, including stage and download, are 2230 made in async mode. That is, when caller makes an RPC |stage| to request 2231 devserver to stage certain artifacts, devserver handles the call and starts 2232 staging artifacts in a new thread, and return |Success| without waiting for 2233 staging being completed. When caller receives message |Success|, it polls 2234 devserver's is_staged call until all artifacts are staged. 2235 Such mechanism is designed to prevent cherrypy threads in devserver being 2236 running out, as staging artifacts might take long time, and cherrypy starts 2237 with a fixed number of threads that handle devserver rpc. 2238 """ 2239 2240 def wait_for_artifacts_staged(self, target, build_id, branch, 2241 archive_url=None, artifacts='', files=''): 2242 """Polling devserver.is_staged until all artifacts are staged. 2243 2244 @param target: Target of the android build to stage, e.g., 2245 shamu-userdebug. 2246 @param build_id: Build id of the android build to stage. 2247 @param branch: Branch of the android build to stage. 2248 @param archive_url: Google Storage URL for the build. 2249 @param artifacts: Comma separated list of artifacts to download. 2250 @param files: Comma separated list of files to download. 2251 2252 @return: True if all artifacts are staged in devserver. 2253 """ 2254 kwargs = {'target': target, 2255 'build_id': build_id, 2256 'branch': branch, 2257 'artifacts': artifacts, 2258 'files': files, 2259 'os_type': 'android'} 2260 if archive_url: 2261 kwargs['archive_url'] = archive_url 2262 return self._poll_is_staged(**kwargs) 2263 2264 2265 @remote_devserver_call() 2266 def call_and_wait(self, call_name, target, build_id, branch, archive_url, 2267 artifacts, files, error_message, 2268 expected_response=SUCCESS): 2269 """Helper method to make a urlopen call, and wait for artifacts staged. 2270 2271 @param call_name: name of devserver rpc call. 2272 @param target: Target of the android build to stage, e.g., 2273 shamu-userdebug. 2274 @param build_id: Build id of the android build to stage. 2275 @param branch: Branch of the android build to stage. 2276 @param archive_url: Google Storage URL for the CrOS build. 2277 @param artifacts: Comma separated list of artifacts to download. 2278 @param files: Comma separated list of files to download. 2279 @param expected_response: Expected response from rpc, default to 2280 |Success|. If it's set to None, do not compare 2281 the actual response. Any response is consider 2282 to be good. 2283 @param error_message: Error message to be thrown if response does not 2284 match expected_response. 2285 2286 @return: The response from rpc. 2287 @raise DevServerException upon any return code that's expected_response. 2288 2289 """ 2290 kwargs = {'target': target, 2291 'build_id': build_id, 2292 'branch': branch, 2293 'artifacts': artifacts, 2294 'files': files, 2295 'os_type': 'android'} 2296 if archive_url: 2297 kwargs['archive_url'] = archive_url 2298 return self._call_and_wait(call_name, error_message, expected_response, 2299 **kwargs) 2300 2301 2302 @remote_devserver_call() 2303 def stage_artifacts(self, target=None, build_id=None, branch=None, 2304 image=None, artifacts=None, files='', archive_url=None): 2305 """Tell the devserver to download and stage |artifacts| from |image|. 2306 2307 This is the main call point for staging any specific artifacts for a 2308 given build. To see the list of artifacts one can stage see: 2309 2310 ~src/platfrom/dev/artifact_info.py. 2311 2312 This is maintained along with the actual devserver code. 2313 2314 @param target: Target of the android build to stage, e.g., 2315 shamu-userdebug. 2316 @param build_id: Build id of the android build to stage. 2317 @param branch: Branch of the android build to stage. 2318 @param image: Name of a build to test, in the format of 2319 branch/target/build_id 2320 @param artifacts: A list of artifacts. 2321 @param files: A list of files to stage. 2322 @param archive_url: Optional parameter that has the archive_url to stage 2323 this artifact from. Default is specified in autotest config + 2324 image. 2325 2326 @raise DevServerException upon any return code that's not HTTP OK. 2327 """ 2328 if image and not target and not build_id and not branch: 2329 branch, target, build_id = utils.parse_launch_control_build(image) 2330 if not target or not build_id or not branch: 2331 raise DevServerException('Must specify all build info (target, ' 2332 'build_id and branch) to stage.') 2333 2334 android_build_info = {'target': target, 2335 'build_id': build_id, 2336 'branch': branch} 2337 if not artifacts and not files: 2338 raise DevServerException('Must specify something to stage.') 2339 if not all(android_build_info.values()): 2340 raise DevServerException( 2341 'To stage an Android build, must specify target, build id ' 2342 'and branch.') 2343 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2344 self._stage_artifacts(build, artifacts, files, archive_url, 2345 **android_build_info) 2346 2347 def get_pull_url(self, target, build_id, branch): 2348 """Get the url to pull files from the devserver. 2349 2350 @param target: Target of the android build, e.g., shamu_userdebug 2351 @param build_id: Build id of the android build. 2352 @param branch: Branch of the android build. 2353 2354 @return A url to pull files from the dev server given a specific 2355 android build. 2356 """ 2357 return os.path.join(self.url(), 'static', branch, target, build_id) 2358 2359 2360 def trigger_download(self, target, build_id, branch, artifacts=None, 2361 files='', os='android', synchronous=True): 2362 """Tell the devserver to download and stage an Android build. 2363 2364 Tells the devserver to fetch an Android build from the image storage 2365 server named by _get_image_storage_server(). 2366 2367 If |synchronous| is True, waits for the entire download to finish 2368 staging before returning. Otherwise only the artifacts necessary 2369 to start installing images onto DUT's will be staged before returning. 2370 A caller can then call finish_download to guarantee the rest of the 2371 artifacts have finished staging. 2372 2373 @param target: Target of the android build to stage, e.g., 2374 shamu-userdebug. 2375 @param build_id: Build id of the android build to stage. 2376 @param branch: Branch of the android build to stage. 2377 @param artifacts: A string of artifacts separated by comma. If None, 2378 use the default artifacts for Android or Brillo build. 2379 @param files: String of file seperated by commas. 2380 @param os: OS artifacts to download (android/brillo). 2381 @param synchronous: if True, waits until all components of the image are 2382 staged before returning. 2383 2384 @raise DevServerException upon any return code that's not HTTP OK. 2385 2386 """ 2387 android_build_info = {'target': target, 2388 'build_id': build_id, 2389 'branch': branch} 2390 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2391 if not artifacts: 2392 board = target.split('-')[0] 2393 artifacts = ( 2394 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2395 board, os)) 2396 self._trigger_download(build, artifacts, files=files, 2397 synchronous=synchronous, **android_build_info) 2398 2399 2400 def finish_download(self, target, build_id, branch, os='android'): 2401 """Tell the devserver to finish staging an Android build. 2402 2403 If trigger_download is called with synchronous=False, it will return 2404 before all artifacts have been staged. This method contacts the 2405 devserver and blocks until all staging is completed and should be 2406 called after a call to trigger_download. 2407 2408 @param target: Target of the android build to stage, e.g., 2409 shamu-userdebug. 2410 @param build_id: Build id of the android build to stage. 2411 @param branch: Branch of the android build to stage. 2412 @param os: OS artifacts to download (android/brillo). 2413 2414 @raise DevServerException upon any return code that's not HTTP OK. 2415 """ 2416 android_build_info = {'target': target, 2417 'build_id': build_id, 2418 'branch': branch} 2419 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2420 board = target.split('-')[0] 2421 artifacts = ( 2422 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2423 board)) 2424 self._finish_download(build, artifacts, files='', **android_build_info) 2425 2426 2427 def get_staged_file_url(self, filename, target, build_id, branch): 2428 """Returns the url of a staged file for this image on the devserver. 2429 2430 @param filename: Name of the file. 2431 @param target: Target of the android build to stage, e.g., 2432 shamu-userdebug. 2433 @param build_id: Build id of the android build to stage. 2434 @param branch: Branch of the android build to stage. 2435 2436 @return: The url of a staged file for this image on the devserver. 2437 """ 2438 android_build_info = {'target': target, 2439 'build_id': build_id, 2440 'branch': branch, 2441 'os_type': 'android'} 2442 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2443 return '/'.join([self._get_image_url(build), filename]) 2444 2445 2446 @remote_devserver_call() 2447 def translate(self, build_name): 2448 """Translate the build name if it's in LATEST format. 2449 2450 If the build name is in the format [branch]/[target]/LATEST, return the 2451 latest build in Launch Control otherwise return the build name as is. 2452 2453 @param build_name: build_name to check. 2454 2455 @return The actual build name to use. 2456 """ 2457 branch, target, build_id = utils.parse_launch_control_build(build_name) 2458 if build_id.upper() != 'LATEST': 2459 return build_name 2460 call = self.build_call('latestbuild', branch=branch, target=target, 2461 os_type='android') 2462 translated_build_id = self.run_call(call) 2463 translated_build = (ANDROID_BUILD_NAME_PATTERN % 2464 {'branch': branch, 2465 'target': target, 2466 'build_id': translated_build_id}) 2467 logging.debug('Translated relative build %s to %s', build_name, 2468 translated_build) 2469 return translated_build 2470 2471 2472def _is_load_healthy(load): 2473 """Check if devserver's load meets the minimum threshold. 2474 2475 @param load: The devserver's load stats to check. 2476 2477 @return: True if the load meets the minimum threshold. Return False 2478 otherwise. 2479 2480 """ 2481 # Threshold checks, including CPU load. 2482 if load[DevServer.CPU_LOAD] > DevServer.MAX_CPU_LOAD: 2483 logging.debug('CPU load of devserver %s is at %s%%, which is higher ' 2484 'than the threshold of %s%%', load['devserver'], 2485 load[DevServer.CPU_LOAD], DevServer.MAX_CPU_LOAD) 2486 return False 2487 if load[DevServer.NETWORK_IO] > DevServer.MAX_NETWORK_IO: 2488 logging.debug('Network IO of devserver %s is at %i Bps, which is ' 2489 'higher than the threshold of %i bytes per second.', 2490 load['devserver'], load[DevServer.NETWORK_IO], 2491 DevServer.MAX_NETWORK_IO) 2492 return False 2493 return True 2494 2495 2496def _compare_load(devserver1, devserver2): 2497 """Comparator function to compare load between two devservers. 2498 2499 @param devserver1: A dictionary of devserver load stats to be compared. 2500 @param devserver2: A dictionary of devserver load stats to be compared. 2501 2502 @return: Negative value if the load of `devserver1` is less than the load 2503 of `devserver2`. Return positive value otherwise. 2504 2505 """ 2506 return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO]) 2507 2508 2509def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None): 2510 """Get the devserver with the least load. 2511 2512 Iterate through all devservers and get the one with least load. 2513 2514 TODO(crbug.com/486278): Devserver with required build already staged should 2515 take higher priority. This will need check_health call to be able to verify 2516 existence of a given build/artifact. Also, in case all devservers are 2517 overloaded, the logic here should fall back to the old behavior that randomly 2518 selects a devserver based on the hash of the image name/url. 2519 2520 @param devserver_type: Type of devserver to select from. Default is set to 2521 ImageServer. 2522 @param hostname: Hostname of the dut that the devserver is used for. The 2523 picked devserver needs to respect the location of the host if 2524 `prefer_local_devserver` is set to True or `restricted_subnets` is 2525 set. 2526 2527 @return: Name of the devserver with the least load. 2528 2529 """ 2530 logging.debug('Get the least loaded %r', devserver_type) 2531 devservers, can_retry = devserver_type.get_available_devservers( 2532 hostname) 2533 # If no healthy devservers available and can_retry is False, return None. 2534 # Otherwise, relax the constrain on hostname, allow all devservers to be 2535 # available. 2536 if not devserver_type.get_healthy_devserver('', devservers): 2537 if not can_retry: 2538 return None 2539 else: 2540 devservers, _ = devserver_type.get_available_devservers() 2541 2542 # get_devserver_load call needs to be made in a new process to allow force 2543 # timeout using signal. 2544 output = multiprocessing.Queue() 2545 processes = [] 2546 for devserver in devservers: 2547 processes.append(multiprocessing.Process( 2548 target=devserver_type.get_devserver_load_wrapper, 2549 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output))) 2550 2551 for p in processes: 2552 p.start() 2553 for p in processes: 2554 p.join() 2555 loads = [output.get() for p in processes] 2556 # Filter out any load failed to be retrieved or does not support load check. 2557 loads = [load for load in loads if load and DevServer.CPU_LOAD in load and 2558 DevServer.is_free_disk_ok(load) and 2559 DevServer.is_apache_client_count_ok(load)] 2560 if not loads: 2561 logging.debug('Failed to retrieve load stats from any devserver. No ' 2562 'load balancing can be applied.') 2563 return None 2564 loads = [load for load in loads if _is_load_healthy(load)] 2565 if not loads: 2566 logging.error('No devserver has the capacity to be selected.') 2567 return None 2568 loads = sorted(loads, cmp=_compare_load) 2569 return loads[0]['devserver'] 2570 2571 2572def resolve(build, hostname=None, ban_list=None): 2573 """Resolve a devserver can be used for given build and hostname. 2574 2575 @param build: Name of a build to stage on devserver, e.g., 2576 ChromeOS build: daisy-release/R50-1234.0.0 2577 Launch Control build: git_mnc_release/shamu-eng 2578 @param hostname: Hostname of a devserver for, default is None, which means 2579 devserver is not restricted by the network location of the host. 2580 @param ban_list: The blacklist of devservers shouldn't be chosen. 2581 2582 @return: A DevServer instance that can be used to stage given build for the 2583 given host. 2584 """ 2585 if utils.is_launch_control_build(build): 2586 return AndroidBuildServer.resolve(build, hostname) 2587 else: 2588 return ImageServer.resolve(build, hostname, ban_list=ban_list) 2589