1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5from distutils import version 6import cStringIO 7import HTMLParser 8import httplib 9import json 10import logging 11import multiprocessing 12import os 13import re 14import socket 15import time 16import urllib2 17import urlparse 18 19from autotest_lib.client.bin import utils as bin_utils 20from autotest_lib.client.common_lib import android_utils 21from autotest_lib.client.common_lib import error 22from autotest_lib.client.common_lib import global_config 23from autotest_lib.client.common_lib import utils 24from autotest_lib.client.common_lib.cros import retry 25from autotest_lib.server import utils as server_utils 26# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107 27 28try: 29 from chromite.lib import metrics 30except ImportError: 31 metrics = utils.metrics_mock 32 33 34CONFIG = global_config.global_config 35# This file is generated at build time and specifies, per suite and per test, 36# the DEPENDENCIES list specified in each control file. It's a dict of dicts: 37# {'bvt': {'/path/to/autotest/control/site_tests/test1/control': ['dep1']} 38# 'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']} 39# 'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'], 40# '/path/to/autotest/control/site_tests/test3/control': ['dep3']} 41# } 42DEPENDENCIES_FILE = 'test_suites/dependency_info' 43# Number of seconds for caller to poll devserver's is_staged call to check if 44# artifacts are staged. 45_ARTIFACT_STAGE_POLLING_INTERVAL = 5 46# Artifacts that should be staged when client calls devserver RPC to stage an 47# image. 48_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful' 49# Artifacts that should be staged when client calls devserver RPC to stage an 50# image with autotest artifact. 
51_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,' 52 'control_files,stateful,' 53 'autotest_packages') 54# Artifacts that should be staged when client calls devserver RPC to stage an 55# Android build. 56_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions') 57SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value( 58 'CROS', 'skip_devserver_health_check', type=bool) 59# Number of seconds for the call to get devserver load to time out. 60TIMEOUT_GET_DEVSERVER_LOAD = 2.0 61 62# Android artifact path in devserver 63ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value( 64 'CROS', 'android_build_name_pattern', type=str).replace('\\', '') 65 66# Return value from a devserver RPC indicating the call succeeded. 67SUCCESS = 'Success' 68 69# The timeout minutes for a given devserver ssh call. 70DEVSERVER_SSH_TIMEOUT_MINS = 1 71 72# Error message for invalid devserver response. 73ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error' 74ERR_MSG_FOR_DOWN_DEVSERVER = 'Service Unavailable' 75 76# Error message for devserver call timedout. 77ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout' 78 79# The timeout minutes for waiting a devserver staging. 80DEVSERVER_IS_STAGING_RETRY_MIN = 100 81 82# The timeout minutes for waiting a DUT auto-update finished. 83DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100 84 85# The total times of devserver triggering CrOS auto-update. 86AU_RETRY_LIMIT = 2 87 88# Number of seconds for caller to poll devserver's get_au_status call to 89# check if cros auto-update is finished. 90CROS_AU_POLLING_INTERVAL = 10 91 92# Number of seconds for intervals between retrying auto-update calls. 93CROS_AU_RETRY_INTERVAL = 20 94 95# The file name for auto-update logs. 96CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log' 97 98# Provision error patterns. 99# People who see this should know that they shouldn't change these 100# classification strings. These strings are used for monitoring provision 101# failures. 
Any changes may mess up the stats. 102_EXCEPTION_PATTERNS = [ 103 # Raised when devserver portfile does not exist on host. 104 (r".*Devserver portfile does not exist!.*$", 105 '(1) Devserver portfile does not exist on host'), 106 # Raised when devserver cannot copy packages to host. 107 (r".*Could not copy .* to device.*$", 108 '(2) Cannot copy packages to host'), 109 # Raised when devserver fails to run specific commands on host. 110 (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$", 111 '(3) Fail to run specific command on host'), 112 # Raised when new build fails to boot on the host. 113 (r'.*RootfsUpdateError: Build .* failed to boot on.*$', 114 '(4) Build failed to boot on host'), 115 # Raised when the auto-update process is timed out. 116 (r'.*The CrOS auto-update process is timed out, ' 117 'thus will be terminated.*$', 118 '(5) Auto-update is timed out'), 119 # Raised when the host is not pingable. 120 (r".*DeviceNotPingableError.*$", 121 '(6) Host is not pingable during auto-update'), 122 # Raised when hosts have unexpected status after rootfs update. 123 (r'.*Update failed with unexpected update status: ' 124 'UPDATE_STATUS_IDLE.*$', 125 '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs ' 126 'update'), 127 # Raised when devserver returns non-json response to shard/drone. 
class DevServerExceptionClassifier(object):
    """Classifies a provision error string raised while calling auto_update.

    The error text is matched against the (regex, classification) pairs in
    _EXCEPTION_PATTERNS; the first matching pair wins.
    """
    def __init__(self, err, keep_full_trace=True):
        """
        @param err: A single string representing one time provision
            error happened in auto_update().
        @param keep_full_trace: True to keep the whole track trace of error.
            False when just keep the last (non-blank) line.
        """
        if keep_full_trace:
            self._err = err
        else:
            # A trace usually ends with a newline, in which case a plain
            # split('\n')[-1] would yield '' and throw the real error away.
            # Keep the last line that actually has content instead.
            non_blank = [line for line in err.split('\n') if line.strip()]
            self._err = non_blank[-1] if non_blank else ''
        # Lazily computed by the `classification` property.
        self._classification = None

    def _classify(self):
        """Return the classification string of the first matching pattern.

        @return: The classification paired with the first regex in
            _EXCEPTION_PATTERNS that matches the error, or
            '(0) Unknown exception' if none matches.
        """
        # NOTE(review): re.match() anchors at the start of the string and the
        # patterns use '.*', which does not cross newlines. A multi-line trace
        # (keep_full_trace=True) therefore looks unlikely to match unless the
        # interesting text is on its first line -- confirm this is intended.
        for err_pattern, classification in _EXCEPTION_PATTERNS:
            if re.match(err_pattern, self._err):
                return classification

        return '(0) Unknown exception'

    @property
    def classification(self):
        """Classify the error

        @return: return a classified exception type (string) from
            _EXCEPTION_PATTERNS or '(0) Unknown exception'. Current patterns
            in _EXCEPTION_PATTERNS are very specific so that errors cannot
            match more than one pattern.
        """
        if self._classification is None:
            self._classification = self._classify()
        return self._classification

    @property
    def summary(self):
        """Use one line to show the error message."""
        return ' '.join(self._err.splitlines())

    @property
    def classified_exception(self):
        """What kind of exception will be raised to higher.

        @return: return a special Exception when the raised error is an
            RootfsUpdateError. Otherwise, return general DevServerException.
        """
        # The classification of RootfsUpdateError in _EXCEPTION_PATTERNS starts
        # with "(4)"
        if self.classification.startswith('(4)'):
            return BadBuildException

        return DevServerException
252 253 @param message: A string returned by an HTTP call. 254 255 @return: A string with HTTP marker being stripped. 256 """ 257 strip = MarkupStripper() 258 try: 259 strip.feed(message.decode('utf_32')) 260 except UnicodeDecodeError: 261 strip.feed(message) 262 return strip.get_data() 263 264 265def _get_image_storage_server(): 266 return CONFIG.get_config_value('CROS', 'image_storage_server', type=str) 267 268 269def _get_canary_channel_server(): 270 """ 271 Get the url of the canary-channel server, 272 eg: gsutil://chromeos-releases/canary-channel/<board>/<release> 273 274 @return: The url to the canary channel server. 275 """ 276 return CONFIG.get_config_value('CROS', 'canary_channel_server', type=str) 277 278 279def _get_storage_server_for_artifacts(artifacts=None): 280 """Gets the appropriate storage server for the given artifacts. 281 282 @param artifacts: A list of artifacts we need to stage. 283 @return: The address of the storage server that has these artifacts. 284 The default image storage server if no artifacts are specified. 285 """ 286 factory_artifact = global_config.global_config.get_config_value( 287 'CROS', 'factory_artifact', type=str, default='') 288 if artifacts and factory_artifact and factory_artifact in artifacts: 289 return _get_canary_channel_server() 290 return _get_image_storage_server() 291 292 293def _gs_or_local_archive_url_args(archive_url): 294 """Infer the devserver call arguments to use with the given archive_url. 295 296 @param archive_url: The archive url to include the in devserver RPC. This 297 can either e a GS path or a local path. 298 @return: A dict of arguments to include in the devserver call. 299 """ 300 if not archive_url: 301 return {} 302 elif archive_url.startswith('gs://'): 303 return {'archive_url': archive_url} 304 else: 305 # For a local path, we direct the devserver to move the files while 306 # staging. This is the fastest way to stage local files, but deletes the 307 # files from the source. 
This is OK because the files are available on 308 # the devserver once staged. 309 return { 310 'local_path': archive_url, 311 'delete_source': True, 312 } 313 314 315def _reverse_lookup_from_config(address): 316 """Look up hostname for the given IP address. 317 318 This uses the hostname-address map from the config file. 319 320 If multiple hostnames map to the same IP address, the first one 321 defined in the configuration file takes precedence. 322 323 @param address: IP address string 324 @returns: hostname string, or original input if not found 325 """ 326 for hostname, addr in _get_hostname_addr_map().iteritems(): 327 if addr == address: 328 return hostname 329 return address 330 331 332def _get_hostname_addr_map(): 333 """Get hostname address mapping from config. 334 335 @return: dict mapping server hostnames to addresses 336 """ 337 return CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP') 338 339 340def _get_dev_server_list(): 341 return CONFIG.get_config_value('CROS', 'dev_server', type=list, default=[]) 342 343 344def _get_crash_server_list(): 345 return CONFIG.get_config_value('CROS', 'crash_server', type=list, 346 default=[]) 347 348 349def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN, 350 exception_to_raise=DevServerException): 351 """A decorator to use with remote devserver calls. 352 353 This decorator converts urllib2.HTTPErrors into DevServerExceptions 354 with any embedded error info converted into plain text. The method 355 retries on urllib2.URLError or error.CmdError to avoid devserver flakiness. 
def get_hostname(url):
    """Extract the hostname component from a URL.

    The URL is expected to look like schema://hostname:port/path

    @param url: a Url string
    @return: the hostname string as reported by urlparse (None when the URL
             has no network location part).
    """
    parsed_url = urlparse.urlparse(url)
    return parsed_url.hostname
425 A caller should instantiate a sub-class of DevServer with: 426 427 host = SubClassServer.resolve(build) 428 server = SubClassServer(host) 429 """ 430 _MIN_FREE_DISK_SPACE_GB = 20 431 _MAX_APACHE_CLIENT_COUNT = 75 432 # Threshold for the CPU load percentage for a devserver to be selected. 433 MAX_CPU_LOAD = 80.0 434 # Threshold for the network IO, set to 80MB/s 435 MAX_NETWORK_IO = 1024 * 1024 * 80 436 DISK_IO = 'disk_total_bytes_per_second' 437 NETWORK_IO = 'network_total_bytes_per_second' 438 CPU_LOAD = 'cpu_percent' 439 FREE_DISK = 'free_disk' 440 AU_PROCESS = 'au_process_count' 441 STAGING_THREAD_COUNT = 'staging_thread_count' 442 APACHE_CLIENT_COUNT = 'apache_client_count' 443 444 445 def __init__(self, devserver): 446 self._devserver = devserver 447 448 449 def url(self): 450 """Returns the url for this devserver.""" 451 return self._devserver 452 453 454 @property 455 def hostname(self): 456 """Return devserver hostname parsed from the devserver URL. 457 458 Note that this is likely parsed from the devserver URL from 459 shadow_config.ini, meaning that the "hostname" part of the 460 devserver URL is actually an IP address. 461 462 @return hostname string 463 """ 464 return get_hostname(self.url()) 465 466 467 @property 468 def resolved_hostname(self): 469 """Return devserver hostname, resolved from its IP address. 470 471 Unlike the hostname property, this property attempts to look up 472 the proper hostname from the devserver IP address. If lookup 473 fails, then fall back to whatever the hostname property would 474 have returned. 475 476 @return hostname string 477 """ 478 return _reverse_lookup_from_config(self.hostname) 479 480 481 @staticmethod 482 def get_server_url(url): 483 """Get the devserver url from a repo url, which includes build info. 484 485 @param url: A job repo url. 
@classmethod
def get_devserver_load(cls, devserver,
                       timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
    """Query the |devserver|'s check_health RPC and return its load.

    @param devserver: url of the devserver.
    @param timeout_min: How long to wait in minutes before deciding the
                        the devserver is not up (float). It is used both
                        as the retry window of the RPC and, converted to
                        seconds, as the timeout of each individual call.

    @return: A dictionary of the devserver's load, decoded from the JSON
             response. None if the call or the decoding failed; the
             failure is logged rather than raised.

    """
    call = cls._build_call(devserver, 'check_health')

    @remote_devserver_call(timeout_min=timeout_min)
    def get_load(devserver=devserver):
        """Inner method that makes the call."""
        return cls.run_call(call, timeout=timeout_min*60)

    try:
        # devserver is passed by keyword so that remote_devserver_call can
        # find it in kwargs when it reports a timed out call.
        return json.loads(get_load(devserver=devserver))
    except Exception as e:
        # Best effort: callers treat a missing load as "not healthy".
        logging.error('Devserver call failed: "%s", timeout: %s seconds,'
                      ' Error: %s', call, timeout_min * 60, e)
        return None
@classmethod
def is_apache_client_count_ok(cls, load):
    """Tell whether a devserver still has Apache connections to spare.

    Apache server by default has maximum of 150 concurrent connections. If
    a devserver has too many live connections, it likely indicates the
    server is busy handling many long running download requests, e.g.,
    downloading stateful partitions. It is better not to add more requests
    to it.

    @param load: A dict of the load of the devserver.

    @return: False only if the reported Apache client count exceeds
             _MAX_APACHE_CLIENT_COUNT. True otherwise, including when the
             health check is skipped in global config or the count was
             not collected from the devserver.

    """
    if SKIP_DEVSERVER_HEALTH_CHECK:
        logging.debug('devserver health check is skipped.')
        return True

    if cls.APACHE_CLIENT_COUNT not in load:
        logging.debug('Apache client count is not collected from devserver.')
        return True

    too_many_clients = (load[cls.APACHE_CLIENT_COUNT] >
                        cls._MAX_APACHE_CLIENT_COUNT)
    return not too_many_clients
@staticmethod
def _build_call(host, method, **kwargs):
    """Assemble the URL that invokes |method| on |host| with |kwargs|.

    The key/value pairs in |kwargs| become the query string of the URL.
    Values are inserted as-is; no URL quoting is applied here.

    @param host: a string that is the host basename e.g. http://server:90.
    @param method: the dev server method to call.
    @param kwargs: a dict mapping arg names to arg values.
    @return the URL string.
    """
    # If the archive_url is a local path, the args expected by the devserver
    # are a little different, so swap it for whatever arguments fit.
    raw_archive_url = kwargs.pop('archive_url', None)
    kwargs.update(_gs_or_local_archive_url_args(raw_archive_url))

    query = '&'.join('%s=%s' % (key, value)
                     for key, value in kwargs.iteritems())
    return '%s/%s?%s' % (host, method, query)
694 """ 695 if timeout is not None: 696 return utils.urlopen_socket_timeout( 697 call, timeout=timeout).read() 698 elif readline: 699 response = urllib2.urlopen(call) 700 return [line.rstrip() for line in response] 701 else: 702 return urllib2.urlopen(call).read() 703 704 705 @staticmethod 706 def servers(): 707 """Returns a list of servers that can serve as this type of server.""" 708 raise NotImplementedError() 709 710 711 @classmethod 712 def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT, 713 unrestricted_only=False): 714 """Get the devservers in the same subnet of the given ip. 715 716 @param ip: The IP address of a dut to look for devserver. 717 @param mask_bits: Number of mask bits. Default is 19. 718 @param unrestricted_only: Set to True to select from devserver in 719 unrestricted subnet only. Default is False. 720 721 @return: A list of devservers in the same subnet of the given ip. 722 723 """ 724 # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so 725 # we need a dict to return the full devserver path once the IPs are 726 # filtered in get_servers_in_same_subnet. 
@classmethod
def get_healthy_devserver(cls, build, devservers, ban_list=None):
    """Get a healthy devserver instance from the list of devservers.

    Candidates are tried one at a time, starting from an index derived
    from the hash of |build|, until one passes devserver_healthy().

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
    @param devservers: The devserver list to be chosen out a healthy one.
                       The list is not modified.
    @param ban_list: The blacklist of devservers we don't want to choose.
                     Default is None.

    @return: A DevServer object of a healthy devserver. Return None if no
             healthy devserver is found.

    """
    logging.debug('Pick one healthy devserver from %r', devservers)
    # Pop from a private copy. Emptying the caller's list as a side effect
    # makes callers that inspect it afterwards (e.g. to report which
    # devservers were tried) see nothing.
    candidates = list(devservers) if devservers else []
    while candidates:
        hash_index = hash(build) % len(candidates)
        devserver = candidates.pop(hash_index)
        logging.debug('Check health for %s', devserver)
        if ban_list and devserver in ban_list:
            continue

        if cls.devserver_healthy(devserver):
            logging.debug('Pick %s', devserver)
            return cls(devserver)

    return None
' 828 'Try to locate a devserver inside subnet ' 829 '%s:%d.', hostname, host_ip, subnet_ip, 830 mask_bits) 831 devservers = cls.get_devservers_in_same_subnet( 832 subnet_ip, mask_bits) 833 return devservers, False 834 835 # If prefer_local_devserver is set to True and the host is not in 836 # restricted subnet, pick a devserver in the same subnet if possible. 837 # Set can_retry to True so it can pick a different devserver if all 838 # devservers in the same subnet are down. 839 if prefer_local_devserver: 840 return (cls.get_devservers_in_same_subnet( 841 host_ip, DEFAULT_SUBNET_MASKBIT, True), True) 842 843 return cls.get_unrestricted_devservers(restricted_subnets), False 844 845 846 @classmethod 847 def resolve(cls, build, hostname=None, ban_list=None): 848 """"Resolves a build to a devserver instance. 849 850 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 851 @param hostname: The hostname of dut that requests a devserver. It's 852 used to make sure a devserver in the same subnet is 853 preferred. 854 @param ban_list: The blacklist of devservers shouldn't be chosen. 855 856 @raise DevServerException: If no devserver is available. 857 """ 858 tried_devservers = set() 859 devservers, can_retry = cls.get_available_devservers(hostname) 860 if devservers: 861 tried_devservers |= set(devservers) 862 863 devserver = cls.get_healthy_devserver(build, devservers, 864 ban_list=ban_list) 865 866 if not devserver and can_retry: 867 # Find available devservers without dut location constrain. 
class CrashServer(DevServer):
    """Class of DevServer that symbolicates crash dumps."""

    @staticmethod
    def servers():
        """Returns the list of crash servers from the global config."""
        return _get_crash_server_list()


    @remote_devserver_call()
    def symbolicate_dump(self, minidump_path, build):
        """Ask the devserver to symbolicate the dump at minidump_path.

        Stage the debug symbols for |build| and, if that works, ask the
        devserver to symbolicate the dump at |minidump_path|.

        @param minidump_path: the on-disk path of the minidump.
        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                      whose debug symbols are needed for symbolication.
        @return The contents of the stack trace. An empty string if the
                'requests' module is not available.
        @raise DevServerException upon any return code that's not HTTP OK.
        """
        try:
            import requests
        except ImportError:
            logging.warning("Can't 'import requests' to connect to dev server.")
            return ''
        f = {'dev_server': self.resolved_hostname}
        c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
        c.increment(fields=f)
        # Symbolicate minidump.
        m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
        with metrics.SecondsTimer(m, fields=f):
            call = self.build_call('symbolicate_dump',
                                   archive_url=_get_image_storage_server() + build)
            # Close the minidump as soon as the upload is done instead of
            # leaking the file handle.
            with open(minidump_path, 'rb') as minidump:
                request = requests.post(call, files={'minidump': minidump})
            if request.status_code == requests.codes.OK:
                return request.text

        # Re-package the failure as an HTTPError so that the
        # remote_devserver_call decorator converts it to a
        # DevServerException.
        error_fd = cStringIO.StringIO(request.text)
        raise urllib2.HTTPError(
                call, request.status_code, request.text, request.headers,
                error_fd)


    @classmethod
    def get_available_devservers(cls, hostname=None):
        """Get all available crash servers.

        Crash server election doesn't need to count the location of hostname.

        @param hostname: Hostname of a DUT to choose devserver for. Unused;
                         optional, as in the base class.

        @return: A tuple of (all crash servers, False). can_retry is set to
                 False, as all crash servers are returned. There is no point to
                 retry.
        """
        return cls.servers(), False
975 """ 976 977 @classmethod 978 def servers(cls): 979 """Returns a list of servers that can serve as a desired type of 980 devserver. 981 """ 982 return _get_dev_server_list() 983 984 985 def _get_image_url(self, image): 986 """Returns the url of the directory for this image on the devserver. 987 988 @param image: the image that was fetched. 989 """ 990 image = self.translate(image) 991 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 992 type=str) 993 return (url_pattern % (self.url(), image)).replace('update', 'static') 994 995 996 @staticmethod 997 def create_metadata(server_name, image, artifacts=None, files=None): 998 """Create a metadata dictionary given the staged items. 999 1000 The metadata can be send to metadata db along with stats. 1001 1002 @param server_name: name of the devserver, e.g 172.22.33.44. 1003 @param image: The name of the image. 1004 @param artifacts: A list of artifacts. 1005 @param files: A list of files. 1006 1007 @return A metadata dictionary. 1008 1009 """ 1010 metadata = {'devserver': server_name, 1011 'image': image, 1012 '_type': 'devserver'} 1013 if artifacts: 1014 metadata['artifacts'] = ' '.join(artifacts) 1015 if files: 1016 metadata['files'] = ' '.join(files) 1017 return metadata 1018 1019 1020 @classmethod 1021 def run_ssh_call(cls, call, readline=False, timeout=None): 1022 """Construct an ssh-based rpc call, and execute it. 1023 1024 @param call: a url string that calls a method to a devserver. 1025 @param readline: whether read http response line by line. 1026 @param timeout: The timeout seconds for ssh call. 1027 1028 @return the results of this call. 
1029 """ 1030 hostname = get_hostname(call) 1031 ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call)) 1032 timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60 1033 try: 1034 result = utils.run(ssh_call, timeout=timeout_seconds) 1035 except error.CmdError as e: 1036 logging.debug('Error occurred with exit_code %d when executing the ' 1037 'ssh call: %s.', e.result_obj.exit_status, 1038 e.result_obj.stderr) 1039 c = metrics.Counter('chromeos/autotest/devserver/ssh_failure') 1040 c.increment(fields={'dev_server': hostname}) 1041 raise 1042 response = result.stdout 1043 1044 # If the curl command's returned HTTP response contains certain 1045 # exception string, raise the DevServerException of the response. 1046 if 'DownloaderException' in response: 1047 raise DevServerException(_strip_http_message(response)) 1048 1049 if readline: 1050 # Remove line terminators and trailing whitespace 1051 response = response.splitlines() 1052 return [line.rstrip() for line in response] 1053 1054 return response 1055 1056 1057 @classmethod 1058 def run_call(cls, call, readline=False, timeout=None): 1059 """Invoke a given devserver call using urllib.open or ssh. 1060 1061 Open the URL with HTTP or SSH-based HTTP, and return the text of the 1062 response. Exceptions may be raised as for urllib2.urlopen() or 1063 utils.run(). 1064 1065 @param call: a url string that calls a method to a devserver. 1066 @param readline: whether read http response line by line. 1067 @param timeout: The timeout seconds for urlopen call or ssh call. 1068 1069 @return the results of this call. 1070 """ 1071 server_name = get_hostname(call) 1072 is_in_restricted_subnet = utils.get_restricted_subnet( 1073 server_name, utils.RESTRICTED_SUBNETS) 1074 _EMPTY_SENTINEL_VALUE = object() 1075 def kickoff_call(): 1076 """Invoke a given devserver call using urllib.open or ssh. 1077 1078 @param call: a url string that calls a method to a devserver. 
@classmethod
def download_file(cls, remote_file, local_file, timeout=None):
    """Fetch a file from the devserver and save it locally.

    The format of remote_file should be:
    http://devserver_ip:8082/static/board/...

    @param remote_file: The URL of the file on devserver that need to be
                        downloaded.
    @param local_file: The path of the file saved to local.
    @param timeout: The timeout seconds for this call.
    """
    contents = cls.run_call(remote_file, timeout=timeout)
    with open(local_file, 'w') as destination:
        destination.write(contents)


def _poll_is_staged(self, **kwargs):
    """Poll devserver.is_staged until all artifacts are staged.

    @param kwargs: keyword arguments to make is_staged devserver call.

    @return: True if all artifacts are staged in devserver.
    """
    is_staged_call = self.build_call('is_staged', **kwargs)

    def all_staged():
        """Make one is_staged rpc and report whether staging is complete.

        @return: True if the devserver answered 'True'. False otherwise,
                 including when the call failed with a URLError or a
                 CmdError, so that the poll retries.
        @raises DevServerException when the call fails with an HTTPError;
                this is re-raised to stop the caller from waiting.
        """
        try:
            result = self.run_call(is_staged_call)
        except urllib2.HTTPError as e:
            raise DevServerException(_strip_http_message(e.read()))
        except urllib2.URLError as e:
            # Could be connection issue, retry it.
            # For example: <urlopen error [Errno 111] Connection refused>
            logging.error('URLError happens in is_stage: %r', e)
            return False
        except error.CmdError as e:
            # Retry if SSH failed to connect to the devserver.
            logging.warning('CmdError happens in is_stage: %r, will retry', e)
            return False
        logging.debug('whether artifact is staged: %r', result)
        return result == 'True'

    bin_utils.poll_for_condition(
            all_staged,
            exception=bin_utils.TimeoutError(),
            timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60,
            sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL)

    return True
def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
    """Tell the devserver to download and stage |artifacts| and |files|
    for |build|.

    This is the main call point for staging any specific artifacts for a
    given build. To see the list of artifacts one can stage see:

    ~src/platfrom/dev/artifact_info.py.

    This is maintained along with the actual devserver code.

    @param build: Name of the build; appended to the storage server url
                  when archive_url is not given.
    @param artifacts: A list of artifacts.
    @param files: A list of files to stage.
    @param archive_url: Optional parameter that has the archive_url to stage
            this artifact from. Default is specified in autotest config +
            image.
    @param kwargs: keyword arguments that specify the build information, to
            make stage devserver call.

    @raise DevServerException if staging times out.
    """
    if not archive_url:
        archive_url = _get_storage_server_for_artifacts(artifacts) + build

    artifacts_arg = ','.join(artifacts or [])
    files_arg = ','.join(files or [])
    staged_items = 'artifacts=%s files=%s ' % (artifacts_arg, files_arg)
    error_message = ("staging %s for %s failed;"
                     "HTTP OK not accompanied by 'Success'." %
                     (staged_items, build))

    staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
                    (build, artifacts, files, archive_url))
    logging.info('Staging artifacts on devserver %s: %s',
                 self.url(), staging_info)
    success = False
    try:
        arguments = {'archive_url': archive_url,
                     'artifacts': artifacts_arg,
                     'files': files_arg}
        # Build information from the caller overrides the defaults above.
        arguments.update(kwargs)
        # TODO(akeshet): canonicalize artifacts_arg before using it as a
        # metric field (as it stands it is a not-very-well-controlled
        # string).
        timer_fields = {'artifacts': artifacts_arg,
                        'dev_server': self.resolved_hostname}
        with metrics.SecondsTimer(
                'chromeos/autotest/devserver/stage_artifact_duration',
                fields=timer_fields):
            self.call_and_wait(call_name='stage', error_message=error_message,
                               **arguments)
            logging.info('Finished staging artifacts: %s', staging_info)
        success = True
    except (bin_utils.TimeoutError, error.TimeoutException):
        logging.error('stage_artifacts timed out: %s', staging_info)
        raise DevServerException(
                'stage_artifacts timed out: %s' % staging_info)
    finally:
        # Always record the outcome, whether staging succeeded or failed.
        counter_fields = {'success': success,
                          'artifacts': artifacts_arg,
                          'dev_server': self.resolved_hostname}
        metrics.Counter('chromeos/autotest/devserver/stage_artifact'
                        ).increment(fields=counter_fields)


def call_and_wait(self, *args, **kwargs):
    """Make a devserver call and wait for the artifacts to be staged.

    This base implementation always raises; a subclass must override it
    with the logic that calls _call_and_wait.

    @raise NotImplementedError always.
    """
    raise NotImplementedError()
def _finish_download(self, build, artifacts, files, **kwargs_build_info):
    """Tell the devserver to finish staging the image for |build|.

    If trigger_download is called with synchronous=False, it will return
    before all artifacts have been staged. This method contacts the
    devserver and blocks until all staging is completed and should be
    called after a call to trigger_download.

    @param build: Name of the build; appended to the image storage server
                  url to form the archive url.
    @param artifacts: Artifacts to pass to the stage call.
    @param files: Files to pass to the stage call.
    @param kwargs_build_info: Dictionary of build information.
            For CrOS, it is None as build is the CrOS image name.
            For Android, it is {'target': target,
                                'build_id': build_id,
                                'branch': branch}

    @raise DevServerException if the stage call times out.
    """
    call_kwargs = {
        'archive_url': _get_image_storage_server() + build,
        'artifacts': artifacts,
        'files': files,
        'error_message': ("finish_download for %s failed;"
                          "HTTP OK not accompanied by 'Success'." % build),
    }
    # Updating with an empty dict is a no-op, so no guard is needed.
    call_kwargs.update(kwargs_build_info)
    try:
        self.call_and_wait(call_name='stage', **call_kwargs)
    except (bin_utils.TimeoutError, error.TimeoutException):
        logging.error('finish_download timed out for %s', build)
        raise DevServerException(
                'finish_download timed out for %s.' % build)
@remote_devserver_call()
def list_control_files(self, build, suite_name=''):
    """Ask the devserver to list all control files for |build|.

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                  whose control files the caller wants listed.
    @param suite_name: The name of the suite for which we require control
                       files.
    @return The result of the `controlfiles` call, read line by line
            (e.g. server/site_tests/autoupdate/control).
    """
    translated = self.translate(build)
    listing_call = self.build_call('controlfiles', build=translated,
                                   suite_name=suite_name)
    return self.run_call(listing_call, readline=True)


@remote_devserver_call()
def get_control_file(self, build, control_path):
    """Ask the devserver for the contents of a control file.

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                  whose control file the caller wants to fetch.
    @param control_path: The file to fetch
                         (e.g. server/site_tests/autoupdate/control)
    @return The contents of the desired file.
    """
    translated = self.translate(build)
    fetch_call = self.build_call('controlfiles', build=translated,
                                 control_path=control_path)
    return self.run_call(fetch_call)


@remote_devserver_call()
def list_suite_controls(self, build, suite_name=''):
    """Ask the devserver to list contents of all control files for |build|.

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                  whose control files' contents the caller wants returned.
    @param suite_name: The name of the suite for which we require control
                       files.
    @return The JSON-decoded devserver response, e.g. a dict of contents of
            all control files
            ({'path1': "#Copyright controls ***", ...,
              'pathX': "#Copyright controls ***"}).
    """
    translated = self.translate(build)
    controls_call = self.build_call('list_suite_controls', build=translated,
                                    suite_name=suite_name)
    raw_response = self.run_call(controls_call)
    return json.loads(raw_response)
class ArtifactUrls(object):
    """Holder for the download URLs of staged update payloads.

    Attributes:
        full_payload:  URL for downloading a staged full release update
        mton_payload:  URL for downloading a staged M-to-N release update
        nton_payload:  URL for downloading a staged N-to-N release update

    """
    def __init__(self, full_payload=None, mton_payload=None,
                 nton_payload=None):
        # Each URL is optional and stored as given.
        self.full_payload = full_payload
        self.mton_payload = mton_payload
        self.nton_payload = nton_payload


def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
    """Poll devserver.is_staged until all artifacts are staged.

    @param archive_url: Google Storage URL for the build.
    @param artifacts: Comma separated list of artifacts to download.
    @param files: Comma separated list of files to download.
    @return: The result of _poll_is_staged, i.e. True once all artifacts
             are staged in devserver.
    """
    return self._poll_is_staged(archive_url=archive_url,
                                artifacts=artifacts,
                                files=files)
@remote_devserver_call()
def stage_artifacts(self, image=None, artifacts=None, files='',
                    archive_url=None):
    """Tell the devserver to download and stage |artifacts| from |image|.

    This is the main call point for staging any specific artifacts for a
    given build. To see the list of artifacts one can stage see:

    ~src/platfrom/dev/artifact_info.py.

    This is maintained along with the actual devserver code.

    @param image: the image to fetch and stage.
    @param artifacts: A list of artifacts.
    @param files: A list of files to stage.
    @param archive_url: Optional parameter that has the archive_url to stage
            this artifact from. Default is specified in autotest config +
            image.

    @raise DevServerException if neither artifacts nor files are given.
    """
    if not (artifacts or files):
        raise DevServerException('Must specify something to stage.')
    self._stage_artifacts(self.translate(image), artifacts, files,
                          archive_url)


@remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS)
def list_image_dir(self, image):
    """Log the contents of the image stage directory on the devserver.

    Each line of the devserver response is written to the info log;
    nothing is returned.

    @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>.
    """
    image = self.translate(image)
    logging.info('Requesting contents from devserver %s for image %s',
                 self.url(), image)
    archive_url = _get_storage_server_for_artifacts() + image
    listing_call = self.build_call('list_image_dir',
                                   archive_url=archive_url)
    for line in self.run_call(listing_call, readline=True):
        logging.info(line)


def trigger_download(self, image, synchronous=True):
    """Tell the devserver to download and stage |image|.

    Tells the devserver to fetch |image| from the image storage server
    named by _get_image_storage_server().

    If |synchronous| is True, waits for the entire download to finish
    staging before returning. Otherwise only the artifacts necessary
    to start installing images onto DUT's will be staged before returning.
    A caller can then call finish_download to guarantee the rest of the
    artifacts have finished staging.

    @param image: the image to fetch and stage.
    @param synchronous: if True, waits until all components of the image are
           staged before returning.
    """
    self._trigger_download(self.translate(image),
                           _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE,
                           files='',
                           synchronous=synchronous)
def finish_download(self, image):
    """Tell the devserver to finish staging |image|.

    If trigger_download is called with synchronous=False, it will return
    before all artifacts have been staged. This method contacts the
    devserver and blocks until all staging is completed and should be
    called after a call to trigger_download.

    @param image: the image to fetch and stage.
    """
    self._finish_download(self.translate(image),
                          _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST,
                          files='')


def get_update_url(self, image):
    """Return the url that should be passed to the updater.

    The url is the CROS/image_url_pattern config value filled in with
    this devserver's url and the translated image name.

    @param image: the image that was fetched.
    """
    translated = self.translate(image)
    url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
                                          type=str)
    return url_pattern % (self.url(), translated)


def get_staged_file_url(self, filename, image):
    """Return the url of a staged file for this image on the devserver.

    @param filename: name of the staged file.
    @param image: the image that was fetched.
    """
    return self._get_image_url(image) + '/' + filename
def get_test_image_url(self, image):
    """Return a URL to a staged test image.

    @param image: the image that was fetched.

    @return A fully qualified URL that can be used for downloading the
            image.

    """
    image_dir_url = self._get_image_url(image)
    return image_dir_url + '/chromiumos_test_image.bin'


def get_recovery_image_url(self, image):
    """Return a URL to a staged recovery image.

    @param image: the image that was fetched.

    @return A fully qualified URL that can be used for downloading the
            image.

    """
    image_dir_url = self._get_image_url(image)
    return image_dir_url + '/recovery_image.bin'


@remote_devserver_call()
def get_dependencies_file(self, build):
    """Ask the dev server for the contents of the suite dependencies file.

    Requests the pre-processed suite dependencies file (at
    DEPENDENCIES_FILE) for |build| through the `controlfiles` call.

    @param build: The build (e.g. x86-mario-release/R21-2333.0.0)
                  whose dependencies the caller is interested in.
    @return The contents of the dependencies file, which should eval to
            a dict of dicts, as per bin_utils/suite_preprocessor.py.
    """
    translated = self.translate(build)
    dependencies_call = self.build_call('controlfiles',
                                        build=translated,
                                        control_path=DEPENDENCIES_FILE)
    return self.run_call(dependencies_call)
def translate(self, build_name):
    """Resolve a build name given in LATEST format to a concrete build.

    If the build name is in the format [builder]/LATEST (matched
    case-insensitively), return the latest build in Google Storage for
    its board; otherwise return the build name as is.

    @param build_name: build_name to check.

    @return The actual build name to use.
    """
    latest_match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
    if latest_match is None:
        return build_name
    # The first group is the board: everything before the last '-' of the
    # builder name.
    board = latest_match.group(1)
    translated_build = self.get_latest_build_in_gs(board)
    logging.debug('Translated relative build %s to %s', build_name,
                  translated_build)
    return translated_build
@remote_devserver_call()
def _kill_au_process_for_host(self, **kwargs):
    """Kill the triggered auto_update process if error happens in cros_au.

    @param kwargs: Arguments to make kill_au_proc devserver call.

    @raise DevServerException if the devserver response is not 'True'.
    """
    call = self.build_call('kill_au_proc', **kwargs)
    response = self.run_call(call)
    if response != 'True':
        # The first fragment ends with a space so the concatenated message
        # reads "...process on devserver..." rather than "processon".
        raise DevServerException(
                'Failed to kill the triggerred CrOS auto_update process '
                'on devserver %s, the response is %s' % (
                        self.url(), response))
def kill_au_process_for_host(self, host_name, pid):
    """Kill the triggered auto_update process if error happens.

    Usually this function is used to clear all potential left au processes
    of the given host name.

    If pid is specified, the devserver will further check the given pid to
    make sure the process is killed. This is used for the case that the au
    process has started in background, but then provision fails due to
    some unknown issues very fast. In this case, when 'kill_au_proc' is
    called, there's no corresponding background track log created for this
    ongoing au process, which prevents this RPC call from killing this au
    process.

    @param host_name: The DUT's hostname.
    @param pid: The ongoing au process's pid.

    @return: True if the kill RPC went through, False if it raised
             DevServerException.
    """
    try:
        self._kill_au_process_for_host(host_name=host_name, pid=pid)
    except DevServerException:
        return False
    return True


@remote_devserver_call()
def _clean_track_log(self, **kwargs):
    """Issue the handler_cleanup devserver call for an auto-update process.

    @param kwargs: Arguments to make handler_cleanup devserver call.
    """
    cleanup_call = self.build_call('handler_cleanup', **kwargs)
    self.run_call(cleanup_call)


def clean_track_log(self, host_name, pid):
    """Clean track log for the current auto-update process.

    @param host_name: The host name to be updated.
    @param pid: The auto-update process id. A falsy pid short-circuits to
                False without contacting the devserver.

    @return: True if track log is successfully cleaned, False otherwise.
    """
    if not pid:
        return False

    try:
        self._clean_track_log(host_name=host_name, pid=pid)
    except DevServerException as e:
        logging.debug('Failed to clean track_status_file on '
                      'devserver for host %s and process id %s: %s',
                      host_name, pid, str(e))
        return False
    return True


def _get_au_log_filename(self, log_dir, host_name, pid):
    """Build the path of the auto-update log for a host/pid in log_dir."""
    log_name = CROS_AU_LOG_FILENAME % (host_name, pid)
    return os.path.join(log_dir, log_name)


def _read_json_response_from_devserver(self, response):
    """Decode a json response coming from the devserver.

    This is extracted to its own function so that it can be easily mocked.

    @param response: the response for a devserver.

    @return: The decoded json value.
    @raise DevServerException if the response is not valid json.
    """
    try:
        decoded = json.loads(response)
    except ValueError as e:
        logging.debug('Failed to load json response: %s', response)
        raise DevServerException(e)
    return decoded
@remote_devserver_call()
def _collect_au_log(self, log_dir, **kwargs):
    """Collect logs from devserver after cros-update process is finished.

    Fetch the logs recording the whole cros-update process through the
    collect_cros_au_log devserver call and write them under log_dir: one
    file per entry of the response's 'host_logs', plus the 'cros_au_log'
    itself.

    The example log file name that is stored is like:
        '1220-repair/sysinfo/CrOS_update_host_name_pid.log'

    @param log_dir: The directory to save the cros-update process log
                    retrieved from devserver. It is created when missing.
    @param kwargs: Arguments to make collect_cros_au_log devserver call.
                   'host_name' (the DUT's hostname) and 'pid' (the
                   auto-update process id on devserver) must be present;
                   they are also used to name the written files.

    @raise DevServerException if the response is not valid json, or the log
           directory or any log file cannot be written.
    """
    call = self.build_call('collect_cros_au_log', **kwargs)
    response = self.run_call(call)
    if not os.path.exists(log_dir):
        try:
            os.makedirs(log_dir)
        except OSError as e:
            # Surface this the same way as the write failures below, so the
            # collect_au_log() wrapper can turn it into a False result.
            raise DevServerException('Failed to create auto-update log '
                                     'directory %s: %s' % (log_dir, e))
    write_file = self._get_au_log_filename(
            log_dir, kwargs['host_name'], kwargs['pid'])
    logging.debug('Saving auto-update logs into %s', write_file)

    au_logs = self._read_json_response_from_devserver(response)

    try:
        for k, v in au_logs['host_logs'].items():
            log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid'])
            log_path = os.path.join(log_dir, log_name)
            with open(log_path, 'w') as out_log:
                out_log.write(v)
    except IOError as e:
        raise DevServerException('Failed to write auto-update hostlogs: '
                                 '%s' % e)

    try:
        with open(write_file, 'w') as out_log:
            out_log.write(au_logs['cros_au_log'])
    except Exception as e:
        # Deliberately broad (missing key, bad type, I/O error all count as
        # "could not save the log"), but unlike a bare except it lets
        # KeyboardInterrupt and SystemExit through.
        raise DevServerException('Failed to write auto-update logs into '
                                 '%s: %s' % (write_file, e))
def collect_au_log(self, host_name, pid, log_dir):
    """Collect logs from devserver after cros-update process is finished.

    @param host_name: the DUT's hostname.
    @param pid: the auto-update process id on devserver. A falsy pid
                short-circuits to False without contacting the devserver.
    @param log_dir: The directory to save the cros-update process log
                    retrieved from devserver.

    @return: True if auto-update log is successfully collected, False
             otherwise.
    """
    if not pid:
        return False

    kwargs = {'host_name': host_name, 'pid': pid}
    try:
        self._collect_au_log(log_dir, **kwargs)
    except DevServerException as e:
        logging.debug('Failed to collect auto-update log on '
                      'devserver for host %s and process id %s: %s',
                      host_name, pid, str(e))
        return False

    return True


@remote_devserver_call()
def _trigger_auto_update(self, **kwargs):
    """Trigger auto-update by calling devserver.cros_au.

    @param kwargs: Arguments to make cros_au devserver call. 'host_name'
                   is required.

    @return: The response of the cros_au call, exactly as run_call()
             returned it.
    @raise KeyError if 'host_name' is not in kwargs.
    @raise DevServerException if the call ends with a bad HTTP status line.
    """
    # Keep the up-front failure the original lookup gave for a missing
    # host name.
    if 'host_name' not in kwargs:
        raise KeyError('host_name')

    # 'async' is a reserved word from Python 3.7 on, so it cannot be spelled
    # as a literal keyword argument; pass it through a dict instead.
    call_args = dict(kwargs)
    call_args['async'] = True
    call = self.build_call('cros_au', **call_args)
    try:
        response = self.run_call(call)
        logging.info(
                'Received response from devserver for cros_au call: %r',
                response)
    except httplib.BadStatusLine as e:
        logging.error(e)
        raise DevServerException('Received Bad Status line, Devserver %s '
                                 'might have gone down while handling '
                                 'the call: %s' % (self.url(), call))

    return response
def _check_for_auto_update_finished(self, pid, wait=True, **kwargs):
    """Polling devserver.get_au_status to get current auto-update status.

    The current auto-update status is used to identify whether the update
    process is finished.

    @param pid: The background process id for auto-update in devserver.
    @param wait: Should the check wait for completion. When True the status
                 is polled every CROS_AU_POLLING_INTERVAL seconds for up to
                 DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN minutes; when
                 False the status is queried exactly once.
    @param kwargs: keyword arguments to make get_au_status devserver call.

    @return: True if auto-update is finished for a given dut.
    """
    logging.debug('Check the progress for auto-update process %r', pid)
    kwargs['pid'] = pid
    call = self.build_call('get_au_status', **kwargs)

    def all_finished():
        """Call devserver.get_au_status rpc to check if auto-update
           is finished.

        @return: True if auto-update is finished for a given dut. False
                 otherwise.
        @raises  DevServerException, the exception is a wrapper of all
                 exceptions that were raised when devserver tried to
                 download the artifacts. devserver raises an HTTPError or
                 a CmdError when an exception was raised in the code. Such
                 exception should be re-raised here to stop the caller from
                 waiting. If the call to devserver failed for connection
                 issue, a URLError exception is raised, and caller should
                 retry the call to avoid such network flakiness.

        """
        # Pre-bind so the ValueError handler below can always format it,
        # even when run_call() itself is what raised.
        au_status = None
        try:
            au_status = self.run_call(call)
            response = json.loads(au_status)
            # This is a temp fix to fit both dict and tuple returning
            # values. The dict check will be removed after a corresponding
            # devserver CL is deployed.
            if isinstance(response, dict):
                if response.get('detailed_error_msg'):
                    raise DevServerException(
                            response.get('detailed_error_msg'))

                if response.get('finished'):
                    logging.debug('CrOS auto-update is finished')
                    return True
                else:
                    logging.debug('Current CrOS auto-update status: %s',
                                  response.get('status'))
                    return False

            if not response[0]:
                logging.debug('Current CrOS auto-update status: %s',
                              response[1])
                return False
            else:
                logging.debug('CrOS auto-update is finished')
                return True
        except urllib2.HTTPError as e:
            error_markup = e.read()
            raise DevServerException(_strip_http_message(error_markup))
        except urllib2.URLError as e:
            # Could be connection issue, retry it.
            # For example: <urlopen error [Errno 111] Connection refused>
            logging.warning('URLError (%r): Retrying connection to '
                            'devserver to check auto-update status.', e)
            return False
        except error.CmdError:
            # Retry if SSH failed to connect to the devserver.
            logging.warning('CmdError: Retrying SSH connection to check '
                            'auto-update status.')
            return False
        except socket.error as e:
            # Could be some temporary devserver connection issues.
            logging.warning('Socket Error (%r): Retrying connection to '
                            'devserver to check auto-update status.', e)
            return False
        except ValueError as e:
            raise DevServerException(
                    '%s (Got AU status: %r)' % (str(e), au_status))

    if wait:
        bin_utils.poll_for_condition(
                all_finished,
                exception=bin_utils.TimeoutError(),
                timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60,
                sleep_interval=CROS_AU_POLLING_INTERVAL)

        return True
    else:
        return all_finished()
def check_for_auto_update_finished(self, response, wait=True, **kwargs):
    """Processing response of 'cros_au' and polling for auto-update status.

    Will wait for the whole auto-update process is finished.

    @param response: The response from RPC 'cros_au'
    @param wait: Passed through to _check_for_auto_update_finished();
                 whether to poll until the update completes.
    @param kwargs: keyword arguments to make get_au_status devserver call.

    @return: a tuple includes three elements.
             finished: True if the operation has completed.
             raised_error: None if everything works well or the raised error.
             pid: the auto-update process id on devserver.
    """

    pid = 0
    raised_error = None
    finished = False
    try:
        response = json.loads(response)
        if response[0]:
            pid = response[1]
            # If provision is kicked off asynchronously, pid will be -1.
            # If provision is not successfully kicked off , pid continues
            # to be 0.
            if pid > 0:
                logging.debug('start process %r for auto_update in '
                              'devserver', pid)
                finished = self._check_for_auto_update_finished(
                        pid, wait=wait, **kwargs)
    except Exception as e:
        logging.debug('Failed to trigger auto-update process on devserver')
        finished = True
        raised_error = e

    # Returning here instead of from a finally clause keeps the reporting
    # of ordinary errors unchanged, while no longer discarding
    # KeyboardInterrupt / SystemExit.
    return finished, raised_error, pid


def _check_error_message(self, error_patterns_to_check, error_msg):
    """Detect whether specific error pattern exist in error message.

    @param error_patterns_to_check: the error patterns to check
    @param error_msg: the error message which may include any error
                      pattern.

    @return A boolean variable, True if error_msg contains any error
        pattern in error_patterns_to_check, False otherwise.
    """
    return any(pattern in error_msg for pattern in error_patterns_to_check)


def _is_retryable(self, error_msg):
    """Detect whether we will retry auto-update based on error_msg.

    @param error_msg: The given error message.

    @return A boolean variable which indicates whether we will retry
        auto_update with another devserver based on the given error_msg.
    """
    # For now we just hard-code the error message we think it's suspicious.
    # When we get more data about what's the json response when devserver
    # is overloaded, we can update this part.
    retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE,
                                'is not pingable']
    return self._check_error_message(retryable_error_patterns, error_msg)


def _should_use_original_payload(self, error_msg):
    """Detect whether error_msg mentions 'DevserverCannotStartError'.

    @param error_msg: The given error message.

    @return True if error_msg contains 'DevserverCannotStartError', False
        otherwise.
    """
    devserver_error_patterns = ['DevserverCannotStartError']
    return self._check_error_message(devserver_error_patterns, error_msg)
def _parse_buildname_safely(self, build_name):
    """Split a build name into its parts without ever raising a parse error.

    @param build_name: the build name to be parsed.

    @return: a tuple (board, build_type, milestone); three empty strings
             when server_utils.ParseBuildName rejects the name.
    """
    try:
        parsed = server_utils.ParseBuildName(build_name)
        board, build_type, milestone, _ = parsed
    except server_utils.ParseBuildNameException:
        logging.warning('Unable to parse build name %s for metrics. '
                        'Continuing anyway.', build_name)
        return '', '', ''

    return board, build_type, milestone
def _emit_auto_update_metrics(self, board, build_type, dut_host_name,
                              build_name, attempt,
                              success, failure_reason, duration):
    """Report the outcome of one auto_update attempt to the metrics backend.

    Three metrics are written under PROVISION_PATH, all with the same
    fields: the build being installed, the failure classification (only
    when the attempt did not succeed) and the attempt's duration.

    @param board: a field in metrics representing which board this
                  auto_update tries to update.
    @param build_type: a field in metrics representing which build type this
                       auto_update tries to update.
    @param dut_host_name: a field in metrics representing which DUT this
                          auto_update tries to update.
    @param build_name: auto update build being updated to.
    @param attempt: a field in metrics, representing which attempt/retry
                    this auto_update is.
    @param success: a field in metrics, representing whether this
                    auto_update succeeds or not.
    @param failure_reason: DevServerExceptionClassifier object to show
                           auto update failure reason, or None.
    @param duration: auto update duration time, in seconds.
    """
    # The following is high cardinality, but sparse.
    # Each DUT is of a single board type, and likely build type.
    # The affinity also results in each DUT being attached to the same
    # dev_server as well.
    fields = {
        'board': board,
        'build_type': build_type,
        'dut_host_name': dut_host_name,
        'dev_server': self.resolved_hostname,
        'attempt': attempt,
        'success': success,
    }

    # reset_after=True is required for String gauges events to ensure that
    # the metrics are not repeatedly emitted until the server restarts.
    build_gauge = metrics.String(
            PROVISION_PATH + '/auto_update_build_by_devserver_dut',
            reset_after=True)
    build_gauge.set(build_name, fields=fields)

    if not success:
        reason_gauge = metrics.String(
                PROVISION_PATH +
                '/auto_update_failure_reason_by_devserver_dut',
                reset_after=True)
        if failure_reason:
            classification = failure_reason.classification
        else:
            classification = ''
        reason_gauge.set(classification, fields=fields)

    duration_dist = metrics.SecondsDistribution(
            PROVISION_PATH + '/auto_update_duration_by_devserver_dut')
    duration_dist.add(duration, fields=fields)
def _emit_provision_metrics(self, error_list, duration_list,
                            is_au_success, board, build_type, milestone,
                            dut_host_name, is_aue2etest,
                            total_duration, build_name):
    """Report the outcome of a whole provision request.

    Provision represents potentially multiple auto update attempts.

    Please note: to avoid reaching or exceeding the monarch field
    cardinality limit, we avoid a metric that includes both dut hostname
    and other high cardinality fields.

    @param error_list: a list of DevServerExceptionClassifier objects to
        show errors happened in provision. Usually it contains 1 ~
        AU_RETRY_LIMIT objects since we only retry provision for several
        times. Only the first entry is reported.
    @param duration_list: a list of provision duration time, counted by
        seconds. Not read by this method.
    @param is_au_success: a field in metrics, representing whether this
        auto_update succeeds or not.
    @param board: a field in metrics representing which board this
        auto_update tries to update.
    @param build_type: a field in metrics representing which build type this
        auto_update tries to update.
    @param milestone: which milestone this auto_update tries to update.
        Not read by this method.
    @param dut_host_name: a field in metrics representing which DUT this
        auto_update tries to update.
    @param is_aue2etest: whether provision was done as part of the
        autoupdate_EndToEndTest. Not read by this method.
    @param total_duration: value added to the provision duration
        distribution.
    @param build_name: value set on the provision build metric.
    """
    # The following is high cardinality, but sparse.
    # Each DUT is of a single board type, and likely build type.
    # The affinity also results in each DUT being attached to the same
    # dev_server as well.
    fields = {
        'board': board,
        'build_type': build_type,
        'dut_host_name': dut_host_name,
        'dev_server': self.resolved_hostname,
        'success': is_au_success,
    }

    # reset_after=True is required for String gauges events to ensure that
    # the metrics are not repeatedly emitted until the server restarts.
    build_gauge = metrics.String(
            PROVISION_PATH + '/provision_build_by_devserver_dut',
            reset_after=True)
    build_gauge.set(build_name, fields=fields)

    if error_list:
        reason_gauge = metrics.String(
                PROVISION_PATH +
                '/provision_failure_reason_by_devserver_dut',
                reset_after=True)
        reason_gauge.set(error_list[0].classification, fields=fields)

    duration_dist = metrics.SecondsDistribution(
            PROVISION_PATH + '/provision_duration_by_devserver_dut')
    duration_dist.add(total_duration, fields=fields)
def _parse_buildname_from_gs_uri(self, uri):
    """Get parameters needed for AU metrics when build_name is not known.

    autoupdate_EndToEndTest is run with two Google Storage URIs from the
    gs://chromeos-releases bucket. URIs in this bucket do not have the
    build_name in the format samus-release/R60-0000.0.0.

    We can get the milestone and board by checking the instructions.json
    file contained in the bucket with the payloads.

    @param uri: The partial uri we received from autoupdate_EndToEndTest.

    @return: a tuple (board, 'release', milestone) read from the first
             instructions file found, or ('', '', '') when none is found or
             it cannot be fetched or decoded.
    """
    try:
        # Get the instructions file that contains info about the build.
        gs_file = 'gs://chromeos-releases/' + uri + '/*instructions.json'
        files = bin_utils.gs_ls(gs_file)
        # Only the first listed file is ever used: the loop body returns.
        for f in files:
            gs_folder, _, instruction_file = f.rpartition('/')
            self.stage_artifacts(image=uri,
                                 files=[instruction_file],
                                 archive_url=gs_folder)
            json_file = self.get_staged_file_url(instruction_file, uri)
            response = urllib2.urlopen(json_file)
            try:
                data = json.load(response)
            finally:
                # Do not leave the HTTP connection open.
                response.close()
            return data['board'], 'release', data['version']['milestone']
    except (ValueError, KeyError, error.CmdError, urllib2.URLError) as e:
        # KeyError: the instructions file decoded fine but lacks an
        # expected key. These values only feed metrics, so degrade to the
        # empty result like the other failures.
        logging.debug('Problem getting values for metrics: %s', e)
        logging.warning('Unable to parse build name %s from AU test for '
                        'metrics. Continuing anyway.', uri)

    return '', '', ''
def auto_update(self, host_name, build_name, original_board=None,
                original_release_version=None, log_dir=None,
                force_update=False, full_update=False,
                payload_filename=None, force_original=False,
                clobber_stateful=True, quick_provision=False):
    """Auto-update a CrOS host.

    Up to AU_RETRY_LIMIT attempts are made. After a failed attempt that
    will be retried, the next attempt addresses the DUT by IP when the
    host name can be resolved.

    @param host_name: The hostname of the DUT to auto-update.
    @param build_name:  The build name to be auto-updated on the DUT.
    @param original_board: The original board of the DUT to auto-update.
    @param original_release_version: The release version of the DUT's
        current build.
    @param log_dir: The log directory to store auto-update logs from
        devserver.
    @param force_update: Force an update even if the version installed
                         is the same. Default: False.
    @param full_update:  If True, do not run stateful update, directly
                         force a full reimage. If False, try stateful
                         update first if the dut is already installed
                         with the same version.
    @param payload_filename: Used to specify the exact file to
                             use for autoupdating. If None, the payload
                             will be determined by build_name. You
                             must have already staged this file before
                             passing it in here.
    @param force_original: Whether to force stateful update with the
                           original payload.
    @param clobber_stateful: If True do a clean install of stateful.
    @param quick_provision: Attempt to use quick provision path first.

    @return A tuple (is_success, pid) in which:
        1. is_success indicates whether this auto_update succeeds.
        2. pid is the process id of the successful autoupdate run.

    @raise DevServerException if auto_update fails and is not retryable.
    @raise RetryableProvisionException if it fails and is retryable.
    """
    kwargs = {'host_name': host_name,
              'build_name': build_name,
              'force_update': force_update,
              'full_update': full_update,
              'clobber_stateful': clobber_stateful,
              'quick_provision': quick_provision}

    is_aue2etest = payload_filename is not None

    if is_aue2etest:
        kwargs['payload_filename'] = payload_filename

    error_msg = 'CrOS auto-update failed for host %s: %s'
    error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s'
    is_au_success = False
    au_log_dir = os.path.join(log_dir,
                              AUTO_UPDATE_LOG_DIR) if log_dir else None
    error_list = []
    retry_with_another_devserver = False
    duration_list = []

    if is_aue2etest:
        board, build_type, milestone = self._parse_buildname_from_gs_uri(
            build_name)
    else:
        board, build_type, milestone = self._parse_buildname_safely(
            build_name)

    provision_start_time = time.time()
    for au_attempt in range(AU_RETRY_LIMIT):
        logging.debug('Start CrOS auto-update for host %s at %d time(s).',
                      host_name, au_attempt + 1)
        au_start_time = time.time()
        failure_reason = None
        # No matter _trigger_auto_update succeeds or fails, the auto-update
        # track_status_file should be cleaned, and the auto-update execute
        # log should be collected to directory sysinfo. Also, the error
        # raised by _trigger_auto_update should be displayed.
        try:
            # Try update with stateful.tgz of old release version in the
            # last try of auto-update.
            if force_original and original_release_version:
                # Monitor this case in monarch
                original_build = '%s/%s' % (original_board,
                                            original_release_version)
                c = metrics.Counter(
                        'chromeos/autotest/provision/'
                        'cros_update_with_original_build')
                f = {'dev_server': self.resolved_hostname,
                     'board': board,
                     'build_type': build_type,
                     'milestone': milestone,
                     'original_build': original_build}
                c.increment(fields=f)

                logging.debug('Try updating stateful partition of the '
                              'host with the same version of its current '
                              'rootfs partition: %s', original_build)
                response = self._trigger_auto_update(
                        original_build=original_build, **kwargs)
            else:
                response = self._trigger_auto_update(**kwargs)
        except DevServerException as e:
            logging.debug(error_msg_attempt, au_attempt+1, str(e))
            failure_reason = DevServerExceptionClassifier(str(e))
        else:
            _, raised_error, pid = self.check_for_auto_update_finished(
                    response, **kwargs)

            # Error happens in _collect_au_log won't be raised.
            if au_log_dir:
                is_collect_success = self.collect_au_log(
                        kwargs['host_name'], pid, au_log_dir)
            else:
                is_collect_success = True

            # Error happens in _clean_track_log won't be raised.
            if pid >= 0:
                is_clean_success = self.clean_track_log(
                        kwargs['host_name'], pid)
            else:
                is_clean_success = True

            # If any error is raised previously, log it and retry
            # auto-update. Otherwise, claim a successful CrOS auto-update.
            if (not raised_error and is_clean_success and
                is_collect_success):
                logging.debug('CrOS auto-update succeed for host %s',
                              host_name)
                is_au_success = True
                break
            else:
                if not self.kill_au_process_for_host(kwargs['host_name'],
                                                     pid):
                    logging.debug('Failed to kill auto_update process %d',
                                  pid)
                if raised_error:
                    error_str = str(raised_error)
                    logging.debug(error_msg_attempt, au_attempt + 1,
                                  error_str)
                    if au_log_dir:
                        logging.debug('Please see error details in log %s',
                                      self._get_au_log_filename(
                                              au_log_dir,
                                              kwargs['host_name'],
                                              pid))
                    failure_reason = DevServerExceptionClassifier(
                        error_str, keep_full_trace=False)
                    if self._is_retryable(error_str):
                        retry_with_another_devserver = True

                    if self._should_use_original_payload(error_str):
                        force_original = True

        finally:
            # NOTE(review): the original indentation of this section was
            # lost in the reviewed copy; everything down to the hostname
            # fallback is kept inside the finally clause - confirm.
            duration = int(time.time() - au_start_time)
            duration_list.append(duration)
            if failure_reason:
                error_list.append(failure_reason)
            self._emit_auto_update_metrics(board, build_type, host_name,
                                           build_name, au_attempt + 1,
                                           is_au_success, failure_reason,
                                           duration)
            if retry_with_another_devserver:
                break

            if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1:
                time.sleep(CROS_AU_RETRY_INTERVAL)
                # Use the IP of DUT if the hostname failed.
                try:
                    host_name_ip = socket.gethostbyname(host_name)
                except socket.error as e:
                    # A failed lookup must not abort provisioning with a
                    # raw socket error; retry with the name used so far.
                    logging.debug(
                            'Failed to resolve %s to an IP (%s), retrying '
                            'with the same host name.', host_name, e)
                else:
                    kwargs['host_name'] = host_name_ip
                    logging.debug(
                            'AU failed, trying IP instead of hostname: %s',
                            host_name_ip)

    total_duration = int(time.time() - provision_start_time)
    self._emit_provision_metrics(error_list, duration_list, is_au_success,
                                 board, build_type, milestone, host_name,
                                 is_aue2etest, total_duration, build_name)

    if is_au_success:
        return (is_au_success, pid)

    # If errors happen in the CrOS AU process, report the concatenation
    # of the errors happening in first & second provision.
    # If error happens in RPCs of cleaning track log, collecting
    # auto-update logs, or killing auto-update processes, just report a
    # common error here.
    if error_list:
        real_error = ', '.join(['%d) %s' % (i, e.summary)
                                for i, e in enumerate(error_list)])
        if retry_with_another_devserver:
            raise RetryableProvisionException(
                    error_msg % (host_name, real_error))
        else:
            raise error_list[0].classified_exception(
                error_msg % (host_name, real_error))
    else:
        raise DevServerException(error_msg % (
                    host_name, ('RPC calls after the whole auto-update '
                                'process failed.')))
class AndroidBuildServer(ImageServerBase):
    """Class for DevServer that handles RPCs related to Android builds.

    The calls to devserver to stage artifacts, including stage and download, are
    made in async mode. That is, when caller makes an RPC |stage| to request
    devserver to stage certain artifacts, devserver handles the call and starts
    staging artifacts in a new thread, and return |Success| without waiting for
    staging being completed. When caller receives message |Success|, it polls
    devserver's is_staged call until all artifacts are staged.
    Such mechanism is designed to prevent cherrypy threads in devserver being
    running out, as staging artifacts might take long time, and cherrypy starts
    with a fixed number of threads that handle devserver rpc.
    """

    def wait_for_artifacts_staged(self, target, build_id, branch,
                                  archive_url=None, artifacts='', files=''):
        """Polling devserver.is_staged until all artifacts are staged.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the build. Only sent to
                            the devserver when truthy.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.

        @return: True if all artifacts are staged in devserver.
        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._poll_is_staged(**kwargs)


    @remote_devserver_call()
    def call_and_wait(self, call_name, target, build_id, branch, archive_url,
                      artifacts, files, error_message,
                      expected_response=SUCCESS):
        """Helper method to make a urlopen call, and wait for artifacts staged.

        @param call_name: name of devserver rpc call.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the CrOS build. Only sent
                            to the devserver when truthy.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.
        @param expected_response: Expected response from rpc, default to
                                  |Success|. If it's set to None, do not compare
                                  the actual response. Any response is consider
                                  to be good.
        @param error_message: Error message to be thrown if response does not
                              match expected_response.

        @return: The response from rpc.
        @raise DevServerException upon any return code that's not
               expected_response.

        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._call_and_wait(call_name, error_message, expected_response,
                                   **kwargs)


    @remote_devserver_call()
    def stage_artifacts(self, target=None, build_id=None, branch=None,
                        image=None, artifacts=None, files='', archive_url=None):
        """Tell the devserver to download and stage |artifacts| from |image|.

         This is the main call point for staging any specific artifacts for a
        given build. To see the list of artifacts one can stage see:

        ~src/platform/dev/artifact_info.py.

        This is maintained along with the actual devserver code.

        @param target: Target of the android build to stage, e.g.,
                               shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param image: Name of a build to test, in the format of
                      branch/target/build_id. Only parsed when none of
                      target, build_id and branch is given.
        @param artifacts: A list of artifacts.
        @param files: A list of files to stage.
        @param archive_url: Optional parameter that has the archive_url to stage
                this artifact from. Default is specified in autotest config +
                image.

        @raise DevServerException upon any return code that's not HTTP OK,
               when the build info is incomplete, or when neither artifacts
               nor files is given.
        """
        if image and not target and not build_id and not branch:
            branch, target, build_id = utils.parse_launch_control_build(image)
        if not target or not build_id or not branch:
            raise DevServerException('Must specify all build info (target, '
                                     'build_id and branch) to stage.')

        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        if not artifacts and not files:
            raise DevServerException('Must specify something to stage.')
        # A second all(android_build_info.values()) validation used to sit
        # here; it could never fire after the check above and was dropped.
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        self._stage_artifacts(build, artifacts, files, archive_url,
                              **android_build_info)

    def trigger_download(self, target, build_id, branch, artifacts=None,
                         files='', os='android', synchronous=True):
        """Tell the devserver to download and stage an Android build.

        Tells the devserver to fetch an Android build from the image storage
        server named by _get_image_storage_server().

        If |synchronous| is True, waits for the entire download to finish
        staging before returning. Otherwise only the artifacts necessary
        to start installing images onto DUT's will be staged before returning.
        A caller can then call finish_download to guarantee the rest of the
        artifacts have finished staging.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param artifacts: A string of artifacts separated by comma. If None,
               use the default artifacts for Android or Brillo build.
        @param files: String of file separated by commas.
        @param os: OS artifacts to download (android/brillo).
        @param synchronous: if True, waits until all components of the image are
               staged before returning.

        @raise DevServerException upon any return code that's not HTTP OK.

        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        if not artifacts:
            board = target.split('-')[0]
            artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._trigger_download(build, artifacts, files=files,
                               synchronous=synchronous, **android_build_info)


    def finish_download(self, target, build_id, branch, os='android'):
        """Tell the devserver to finish staging an Android build.

        If trigger_download is called with synchronous=False, it will return
        before all artifacts have been staged. This method contacts the
        devserver and blocks until all staging is completed and should be
        called after a call to trigger_download.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param os: OS artifacts to download (android/brillo).

        @raise DevServerException upon any return code that's not HTTP OK.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        board = target.split('-')[0]
        # Honour |os| exactly as trigger_download does, so both calls agree
        # on the artifact list for non-default OS types.
        artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._finish_download(build, artifacts, files='', **android_build_info)


    def get_staged_file_url(self, filename, target, build_id, branch):
        """Returns the url of a staged file for this image on the devserver.

        @param filename: Name of the file.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.

        @return: The url of a staged file for this image on the devserver.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch,
                              'os_type': 'android'}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        return '/'.join([self._get_image_url(build), filename])


    @remote_devserver_call()
    def translate(self, build_name):
        """Translate the build name if it's in LATEST format.

        If the build name is in the format [branch]/[target]/LATEST, return the
        latest build in Launch Control otherwise return the build name as is.

        @param build_name: build_name to check.

        @return The actual build name to use.
        """
        branch, target, build_id = utils.parse_launch_control_build(build_name)
        # The LATEST marker is matched case-insensitively.
        if build_id.upper() != 'LATEST':
            return build_name
        call = self.build_call('latestbuild', branch=branch, target=target,
                               os_type='android')
        translated_build_id = self.run_call(call)
        translated_build = (ANDROID_BUILD_NAME_PATTERN %
                            {'branch': branch,
                             'target': target,
                             'build_id': translated_build_id})
        logging.debug('Translated relative build %s to %s', build_name,
                      translated_build)
        return translated_build
2755 if load[DevServer.CPU_LOAD] > DevServer.MAX_CPU_LOAD: 2756 logging.debug('CPU load of devserver %s is at %s%%, which is higher ' 2757 'than the threshold of %s%%', load['devserver'], 2758 load[DevServer.CPU_LOAD], DevServer.MAX_CPU_LOAD) 2759 return False 2760 if load[DevServer.NETWORK_IO] > DevServer.MAX_NETWORK_IO: 2761 logging.debug('Network IO of devserver %s is at %i Bps, which is ' 2762 'higher than the threshold of %i bytes per second.', 2763 load['devserver'], load[DevServer.NETWORK_IO], 2764 DevServer.MAX_NETWORK_IO) 2765 return False 2766 return True 2767 2768 2769def _compare_load(devserver1, devserver2): 2770 """Comparator function to compare load between two devservers. 2771 2772 @param devserver1: A dictionary of devserver load stats to be compared. 2773 @param devserver2: A dictionary of devserver load stats to be compared. 2774 2775 @return: Negative value if the load of `devserver1` is less than the load 2776 of `devserver2`. Return positive value otherwise. 2777 2778 """ 2779 return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO]) 2780 2781 2782def _get_subnet_for_host_ip(host_ip, 2783 restricted_subnets=utils.RESTRICTED_SUBNETS): 2784 """Get the subnet for a given host IP. 2785 2786 @param host_ip: the IP of a DUT. 2787 @param restricted_subnets: A list of restriected subnets. 2788 2789 @return: a (subnet_ip, mask_bits) tuple. If no matched subnet for the 2790 host_ip, return (None, None). 2791 """ 2792 for subnet_ip, mask_bits in restricted_subnets: 2793 if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits): 2794 return subnet_ip, mask_bits 2795 2796 return None, None 2797 2798 2799def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None): 2800 """Get the devserver with the least load. 2801 2802 Iterate through all devservers and get the one with least load. 2803 2804 TODO(crbug.com/486278): Devserver with required build already staged should 2805 take higher priority. 
This will need check_health call to be able to verify 2806 existence of a given build/artifact. Also, in case all devservers are 2807 overloaded, the logic here should fall back to the old behavior that randomly 2808 selects a devserver based on the hash of the image name/url. 2809 2810 @param devserver_type: Type of devserver to select from. Default is set to 2811 ImageServer. 2812 @param hostname: Hostname of the dut that the devserver is used for. The 2813 picked devserver needs to respect the location of the host if 2814 `prefer_local_devserver` is set to True or `restricted_subnets` is 2815 set. 2816 2817 @return: Name of the devserver with the least load. 2818 2819 """ 2820 logging.debug('Get the least loaded %r', devserver_type) 2821 devservers, can_retry = devserver_type.get_available_devservers( 2822 hostname) 2823 # If no healthy devservers available and can_retry is False, return None. 2824 # Otherwise, relax the constrain on hostname, allow all devservers to be 2825 # available. 2826 if not devserver_type.get_healthy_devserver('', devservers): 2827 if not can_retry: 2828 return None 2829 else: 2830 devservers, _ = devserver_type.get_available_devservers() 2831 2832 # get_devserver_load call needs to be made in a new process to allow force 2833 # timeout using signal. 2834 output = multiprocessing.Queue() 2835 processes = [] 2836 for devserver in devservers: 2837 processes.append(multiprocessing.Process( 2838 target=devserver_type.get_devserver_load_wrapper, 2839 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output))) 2840 2841 for p in processes: 2842 p.start() 2843 for p in processes: 2844 # The timeout for the process commands aren't reliable. Add 2845 # some extra time to the timeout for potential overhead in the 2846 # subprocesses. crbug.com/913695 2847 p.join(TIMEOUT_GET_DEVSERVER_LOAD + 10) 2848 # Read queue before killing processes to avoid corrupting the queue. 
2849 loads = [output.get() for p in processes if not p.is_alive()] 2850 for p in processes: 2851 if p.is_alive(): 2852 p.terminate() 2853 # Filter out any load failed to be retrieved or does not support load check. 2854 loads = [load for load in loads if load and DevServer.CPU_LOAD in load and 2855 DevServer.is_free_disk_ok(load) and 2856 DevServer.is_apache_client_count_ok(load)] 2857 if not loads: 2858 logging.debug('Failed to retrieve load stats from any devserver. No ' 2859 'load balancing can be applied.') 2860 return None 2861 loads = [load for load in loads if _is_load_healthy(load)] 2862 if not loads: 2863 logging.error('No devserver has the capacity to be selected.') 2864 return None 2865 loads = sorted(loads, cmp=_compare_load) 2866 return loads[0]['devserver'] 2867 2868 2869def resolve(build, hostname=None, ban_list=None): 2870 """Resolve a devserver can be used for given build and hostname. 2871 2872 @param build: Name of a build to stage on devserver, e.g., 2873 ChromeOS build: daisy-release/R50-1234.0.0 2874 Launch Control build: git_mnc_release/shamu-eng 2875 @param hostname: Hostname of a devserver for, default is None, which means 2876 devserver is not restricted by the network location of the host. 2877 @param ban_list: The blacklist of devservers shouldn't be chosen. 2878 2879 @return: A DevServer instance that can be used to stage given build for the 2880 given host. 2881 """ 2882 if utils.is_launch_control_build(build): 2883 return AndroidBuildServer.resolve(build, hostname) 2884 else: 2885 return ImageServer.resolve(build, hostname, ban_list=ban_list) 2886