1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5from distutils import version 6import cStringIO 7import HTMLParser 8import httplib 9import json 10import logging 11import multiprocessing 12import os 13import re 14import socket 15import time 16import urllib2 17import urlparse 18 19from autotest_lib.client.bin import utils as bin_utils 20from autotest_lib.client.common_lib import android_utils 21from autotest_lib.client.common_lib import error 22from autotest_lib.client.common_lib import global_config 23from autotest_lib.client.common_lib import utils 24from autotest_lib.client.common_lib.cros import retry 25from autotest_lib.server import utils as server_utils 26# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107 27 28try: 29 from chromite.lib import metrics 30except ImportError: 31 metrics = utils.metrics_mock 32 33 34CONFIG = global_config.global_config 35# This file is generated at build time and specifies, per suite and per test, 36# the DEPENDENCIES list specified in each control file. It's a dict of dicts: 37# {'bvt': {'/path/to/autotest/control/site_tests/test1/control': ['dep1']} 38# 'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']} 39# 'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'], 40# '/path/to/autotest/control/site_tests/test3/control': ['dep3']} 41# } 42DEPENDENCIES_FILE = 'test_suites/dependency_info' 43# Number of seconds for caller to poll devserver's is_staged call to check if 44# artifacts are staged. 45_ARTIFACT_STAGE_POLLING_INTERVAL = 5 46# Artifacts that should be staged when client calls devserver RPC to stage an 47# image. 48_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful' 49# Artifacts that should be staged when client calls devserver RPC to stage an 50# image with autotest artifact. 
51_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,' 52 'control_files,stateful,' 53 'autotest_packages') 54# Artifacts that should be staged when client calls devserver RPC to stage an 55# Android build. 56_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions') 57SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value( 58 'CROS', 'skip_devserver_health_check', type=bool) 59# Number of seconds for the call to get devserver load to time out. 60TIMEOUT_GET_DEVSERVER_LOAD = 2.0 61 62# Android artifact path in devserver 63ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value( 64 'CROS', 'android_build_name_pattern', type=str).replace('\\', '') 65 66# Return value from a devserver RPC indicating the call succeeded. 67SUCCESS = 'Success' 68 69# The timeout minutes for a given devserver ssh call. 70DEVSERVER_SSH_TIMEOUT_MINS = 1 71 72# Error message for invalid devserver response. 73ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error' 74ERR_MSG_FOR_DOWN_DEVSERVER = 'Service Unavailable' 75 76# Error message for devserver call timedout. 77ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout' 78 79# The timeout minutes for waiting a devserver staging. 80DEVSERVER_IS_STAGING_RETRY_MIN = 100 81 82# The timeout minutes for waiting a DUT auto-update finished. 83DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100 84 85# The total times of devserver triggering CrOS auto-update. 86AU_RETRY_LIMIT = 2 87 88# Number of seconds for caller to poll devserver's get_au_status call to 89# check if cros auto-update is finished. 90CROS_AU_POLLING_INTERVAL = 10 91 92# Number of seconds for intervals between retrying auto-update calls. 93CROS_AU_RETRY_INTERVAL = 20 94 95# The file name for auto-update logs. 96CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log' 97 98# Provision error patterns. 99# People who see this should know that they shouldn't change these 100# classification strings. These strings are used for monitoring provision 101# failures. 
Any changes may mess up the stats. 102_EXCEPTION_PATTERNS = [ 103 # Raised when devserver portfile does not exist on host. 104 (r".*Devserver portfile does not exist!.*$", 105 '(1) Devserver portfile does not exist on host'), 106 # Raised when devserver cannot copy packages to host. 107 (r".*Could not copy .* to device.*$", 108 '(2) Cannot copy packages to host'), 109 # Raised when devserver fails to run specific commands on host. 110 (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$", 111 '(3) Fail to run specific command on host'), 112 # Raised when new build fails to boot on the host. 113 (r'.*RootfsUpdateError: Build .* failed to boot on.*$', 114 '(4) Build failed to boot on host'), 115 # Raised when the auto-update process is timed out. 116 (r'.*The CrOS auto-update process is timed out, ' 117 'thus will be terminated.*$', 118 '(5) Auto-update is timed out'), 119 # Raised when the host is not pingable. 120 (r".*DeviceNotPingableError.*$", 121 '(6) Host is not pingable during auto-update'), 122 # Raised when hosts have unexpected status after rootfs update. 123 (r'.*Update failed with unexpected update status: ' 124 'UPDATE_STATUS_IDLE.*$', 125 '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs ' 126 'update'), 127 # Raised when devserver returns non-json response to shard/drone. 
class DevServerException(Exception):
    """Raised when the dev server returns a non-200 HTTP response."""
    pass


class BadBuildException(DevServerException):
    """Raised when build failed to boot on DUT."""
    pass


class RetryableProvisionException(DevServerException):
    """Raised when provision fails due to a retryable reason."""
    pass


class DevServerOverloadException(Exception):
    """Raised when the dev server returns a 502 HTTP response."""
    pass


class DevServerFailToLocateException(Exception):
    """Raised when fail to locate any devserver."""
    pass


class DevServerExceptionClassifier(object):
    """Classifies a provision error raised from a DUT by auto_update.

    The error text is matched against the (regex, label) pairs of
    _EXCEPTION_PATTERNS in list order; the label of the first pair that
    matches any line of the text is the classification.
    """

    def __init__(self, err, keep_full_trace=True):
        """
        @param err: A single string representing one time provision
                    error happened in auto_update().
        @param keep_full_trace: True to keep the whole stack trace of the
                                error. False to keep only its last
                                non-empty line.
        """
        if keep_full_trace:
            self._err = err
        else:
            # A trace commonly ends with a newline; taking the element after
            # the last '\n' would then yield '' and lose the actual message.
            lines = err.rstrip().splitlines()
            self._err = lines[-1] if lines else ''
        self._classification = None


    def _classify(self):
        """Return the label of the first pattern matching the error text.

        @return: A classification string from _EXCEPTION_PATTERNS, or
                 '(0) Unknown exception' if no pattern matches.
        """
        for err_pattern, classification in _EXCEPTION_PATTERNS:
            # The patterns have the shape '.*<needle>.*$'. '.' does not cross
            # newlines and '$' only matches at the end of the text unless
            # MULTILINE is set, so re.match() could never classify a
            # multi-line trace. Search line by line instead.
            if re.search(err_pattern, self._err, re.MULTILINE):
                return classification

        return '(0) Unknown exception'


    @property
    def classification(self):
        """Classify the error

        @return: return a classified exception type (string) from
                 _EXCEPTION_PATTERNS or '(0) Unknown exception'. The result
                 is computed once and cached. If several patterns match a
                 multi-line trace, the one listed first wins.
        """
        if self._classification is None:
            self._classification = self._classify()
        return self._classification


    @property
    def summary(self):
        """Use one line to show the error message."""
        return ' '.join(self._err.splitlines())


    @property
    def classified_exception(self):
        """What kind of exception will be raised to higher.

        @return: return BadBuildException when the raised error is a
                 RootfsUpdateError. Otherwise, return general
                 DevServerException.
        """
        # The classification of RootfsUpdateError in _EXCEPTION_PATTERNS starts
        # with "(4)"
        if self.classification.startswith('(4)'):
            return BadBuildException

        return DevServerException
class MarkupStripper(HTMLParser.HTMLParser):
    """HTML parser that strips HTML tags, coded characters like &amp;

    Works by, basically, not doing anything for any tags, and only recording
    the content of text nodes in an internal data structure.
    """
    def __init__(self):
        # Run the base initializer instead of only reset(), so that any
        # state the base class sets up in __init__ exists before feed().
        # HTMLParser is an old-style class on Python 2, hence the explicit
        # call rather than super().
        HTMLParser.HTMLParser.__init__(self)
        self.fed = []


    def handle_data(self, d):
        """Consume content of text nodes, store it away."""
        self.fed.append(d)


    def get_data(self):
        """Concatenate and return all stored data."""
        return ''.join(self.fed)


def _strip_http_message(message):
    """Strip the HTTP marker from an HTTP message.

    @param message: A string returned by an HTTP call.

    @return: A string with HTTP marker being stripped.
    """
    strip = MarkupStripper()
    try:
        text = message.decode('utf_32')
    except (AttributeError, UnicodeError):
        # Not UTF-32 data, or already a text object without .decode():
        # parse the message as it is.
        text = message
    strip.feed(text)
    return strip.get_data()
def _get_image_storage_server():
    """Get the url of the image storage server from the CROS config section.

    @return: The `image_storage_server` config value, as a string.
    """
    return CONFIG.get_config_value('CROS', 'image_storage_server', type=str)


def _get_canary_channel_server():
    """
    Get the url of the canary-channel server,
    eg: gsutil://chromeos-releases/canary-channel/<board>/<release>

    @return: The url to the canary channel server.
    """
    return CONFIG.get_config_value('CROS', 'canary_channel_server', type=str)


def _get_storage_server_for_artifacts(artifacts=None):
    """Gets the appropriate storage server for the given artifacts.

    @param artifacts: A list of artifacts we need to stage.
    @return: The address of the storage server that has these artifacts.
             The canary channel server if the configured factory artifact is
             among them; the default image storage server otherwise, or if
             no artifacts are specified.
    """
    # Use the module-level CONFIG alias like every other config read here.
    factory_artifact = CONFIG.get_config_value(
            'CROS', 'factory_artifact', type=str, default='')
    if artifacts and factory_artifact and factory_artifact in artifacts:
        return _get_canary_channel_server()
    return _get_image_storage_server()


def _gs_or_local_archive_url_args(archive_url):
    """Infer the devserver call arguments to use with the given archive_url.

    @param archive_url: The archive url to include in the devserver RPC. This
            can either be a GS path or a local path.
    @return: A dict of arguments to include in the devserver call. Empty if
            no archive_url is given.
    """
    if not archive_url:
        return {}

    if archive_url.startswith('gs://'):
        return {'archive_url': archive_url}

    # For a local path, we direct the devserver to move the files while
    # staging. This is the fastest way to stage local files, but deletes the
    # files from the source. This is OK because the files are available on
    # the devserver once staged.
    return {
            'local_path': archive_url,
            'delete_source': True,
    }
def _reverse_lookup_from_config(address):
    """Look up hostname for the given IP address.

    This uses the hostname-address map from the config file.

    If multiple hostnames map to the same IP address, the first one
    yielded by the map wins.
    NOTE(review): that is "the first one defined in the configuration file"
    only if get_section_as_dict() returns an order-preserving mapping --
    confirm.

    @param address: IP address string
    @returns: hostname string, or original input if not found
    """
    # items() rather than iteritems(): available on both Python 2 and 3.
    for hostname, addr in _get_hostname_addr_map().items():
        if addr == address:
            return hostname
    return address


def _get_hostname_addr_map():
    """Get hostname address mapping from config.

    @return: dict mapping server hostnames to addresses
    """
    return CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP')


def _get_dev_server_list():
    """Get the devserver urls configured as CROS/dev_server.

    @return: list of configured values, empty if the option is not set.
    """
    return CONFIG.get_config_value('CROS', 'dev_server', type=list, default=[])


def _get_crash_server_list():
    """Get the crash server urls configured as CROS/crash_server.

    @return: list of configured values, empty if the option is not set.
    """
    return CONFIG.get_config_value('CROS', 'crash_server', type=list,
                                   default=[])
def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN,
                          exception_to_raise=DevServerException):
    """A decorator to use with remote devserver calls.

    This decorator converts urllib2.HTTPErrors into DevServerExceptions
    with any embedded error info converted into plain text. The method
    retries on urllib2.URLError, error.CmdError or
    DevServerOverloadException to avoid devserver flakiness.

    If the call finally fails with an error whose text contains
    ERR_MSG_FOR_TIMED_OUT_CALL, a call_timeout counter is incremented
    before the error is re-raised.

    @param timeout_min: Timeout in minutes handed to retry.retry.
    @param exception_to_raise: Exception class handed to retry.retry.

    @return: A decorator for a devserver RPC method.
    """
    #pylint: disable=C0111
    import functools

    def inner_decorator(method):
        label = getattr(method, '__name__', None)

        def metrics_wrapper(*args, **kwargs):
            @retry.retry((urllib2.URLError, error.CmdError,
                          DevServerOverloadException),
                         timeout_min=timeout_min,
                         exception_to_raise=exception_to_raise,
                         label=label)
            def wrapper():
                """This wrapper actually catches the HTTPError."""
                try:
                    return method(*args, **kwargs)
                except urllib2.HTTPError as e:
                    error_markup = e.read()
                    raise DevServerException(_strip_http_message(error_markup))

            try:
                return wrapper()
            except Exception as e:
                if ERR_MSG_FOR_TIMED_OUT_CALL in str(e):
                    dev_server = None
                    if args and isinstance(args[0], DevServer):
                        dev_server = args[0].hostname
                    elif 'devserver' in kwargs:
                        dev_server = get_hostname(kwargs['devserver'])

                    logging.debug('RPC call %s has timed out on devserver %s.',
                                  label, dev_server)
                    c = metrics.Counter(
                            'chromeos/autotest/devserver/call_timeout')
                    # NOTE(review): the RPC name is reported under a field
                    # called 'healthy'. It looks like a copy/paste slip, but
                    # the field name is part of the metric schema, so it is
                    # kept -- confirm before renaming.
                    c.increment(fields={'dev_server': dev_server,
                                        'healthy': label})

                raise

        # Keep the decorated method's name and docstring. Skipped for
        # callables without __name__, which functools.wraps cannot copy from
        # on Python 2.
        if label is not None:
            metrics_wrapper = functools.wraps(method)(metrics_wrapper)
        return metrics_wrapper

    return inner_decorator


def get_hostname(url):
    """Get the hostname portion of a URL

    schema://hostname:port/path

    @param url: a Url string
    @return: a hostname string
    """
    return urlparse.urlparse(url).hostname


def get_resolved_hostname(url):
    """Get the symbolic hostname from url.

    If the given `url` uses a numeric IP address, try and find a
    symbolic name from the hostname map in the config file.

    @param url  The URL with which to perform the conversion/lookup.
    @return: the hostname found in the config map, or the hostname part of
             `url` unchanged if the map has no entry for it.
    """
    return _reverse_lookup_from_config(get_hostname(url))
class DevServer(object):
    """Base class for all DevServer-like server stubs.

    This is the base class for interacting with all Dev Server-like servers.
    A caller should obtain an instance of a sub-class of DevServer with:

    server = SubClassServer.resolve(build)

    or, for a known devserver url:

    server = SubClassServer(devserver_url)
    """
    _MIN_FREE_DISK_SPACE_GB = 20
    _MAX_APACHE_CLIENT_COUNT = 75
    # Threshold for the CPU load percentage for a devserver to be selected.
    MAX_CPU_LOAD = 80.0
    # Threshold for the network IO, set to 80MB/s
    MAX_NETWORK_IO = 1024 * 1024 * 80
    # Keys of the load dictionary returned by the check_health RPC.
    DISK_IO = 'disk_total_bytes_per_second'
    NETWORK_IO = 'network_total_bytes_per_second'
    CPU_LOAD = 'cpu_percent'
    FREE_DISK = 'free_disk'
    AU_PROCESS = 'au_process_count'
    STAGING_THREAD_COUNT = 'staging_thread_count'
    APACHE_CLIENT_COUNT = 'apache_client_count'


    def __init__(self, devserver):
        """
        @param devserver: url of the devserver, e.g. http://10.1.1.10:8082.
        """
        self._devserver = devserver


    def url(self):
        """Returns the url for this devserver."""
        return self._devserver


    @property
    def hostname(self):
        """Return devserver hostname parsed from the devserver URL.

        Note that this is likely parsed from the devserver URL from
        shadow_config.ini, meaning that the "hostname" part of the
        devserver URL is actually an IP address.

        @return hostname string
        """
        return get_hostname(self.url())


    @property
    def resolved_hostname(self):
        """Return devserver hostname, resolved from its IP address.

        Unlike the hostname property, this property attempts to look up
        the proper hostname from the devserver IP address.  If lookup
        fails, then fall back to whatever the hostname property would
        have returned.

        @return hostname string
        """
        return _reverse_lookup_from_config(self.hostname)


    @staticmethod
    def get_server_url(url):
        """Get the devserver url from a repo url, which includes build info.

        @param url: A job repo url.

        @return A devserver url, e.g., http://127.0.0.10:8080, or None if
                `url` has no network location part.
        """
        res = urlparse.urlparse(url)
        if res.netloc:
            return res.scheme + '://' + res.netloc
        return None


    @classmethod
    def get_devserver_load_wrapper(cls, devserver, timeout_sec, output):
        """A wrapper function to call get_devserver_load in parallel.

        The load is only put into `output` if it could be retrieved.

        @param devserver: url of the devserver.
        @param timeout_sec: Number of seconds before time out the devserver
                            call.
        @param output: An output queue to save results to.
        """
        load = cls.get_devserver_load(devserver, timeout_min=timeout_sec/60.0)
        if load:
            load['devserver'] = devserver
            output.put(load)


    @classmethod
    def get_devserver_load(cls, devserver,
                           timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
        """Returns the load of the |devserver| from its check_health RPC.

        @param devserver: url of the devserver.
        @param timeout_min: How long to wait in minutes before deciding the
                            the devserver is not up (float).

        @return: A dictionary of the devserver's load, or None if the call
                 failed or its response could not be decoded as JSON.

        """
        call = cls._build_call(devserver, 'check_health')

        @remote_devserver_call(timeout_min=timeout_min)
        def get_load(devserver=devserver):
            """Inner method that makes the call."""
            return cls.run_call(call, timeout=timeout_min*60)

        try:
            return json.loads(get_load(devserver=devserver))
        except Exception as e:
            # Deliberately best effort: an unreachable devserver is reported
            # as "no load available" rather than as an error.
            logging.error('Devserver call failed: "%s", timeout: %s seconds,'
                          ' Error: %s', call, timeout_min * 60, e)
            return None


    @classmethod
    def is_free_disk_ok(cls, load):
        """Check if a devserver has enough free disk.

        @param load: A dict of the load of the devserver.

        @return: True if the devserver has enough free disk or disk check is
                 skipped in global config.

        """
        if SKIP_DEVSERVER_HEALTH_CHECK:
            logging.debug('devserver health check is skipped.')
        elif load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB:
            return False

        return True


    @classmethod
    def is_apache_client_count_ok(cls, load):
        """Check if a devserver has enough Apache connections available.

        Apache server by default has maximum of 150 concurrent connections. If
        a devserver has too many live connections, it likely indicates the
        server is busy handling many long running download requests, e.g.,
        downloading stateful partitions. It is better not to add more requests
        to it.

        @param load: A dict of the load of the devserver.

        @return: True if the devserver has enough Apache connections available,
                 the count was not collected, or the health check is skipped
                 in global config.

        """
        if SKIP_DEVSERVER_HEALTH_CHECK:
            logging.debug('devserver health check is skipped.')
        elif cls.APACHE_CLIENT_COUNT not in load:
            logging.debug('Apache client count is not collected from devserver.')
        elif (load[cls.APACHE_CLIENT_COUNT] >
              cls._MAX_APACHE_CLIENT_COUNT):
            return False

        return True


    @classmethod
    def devserver_healthy(cls, devserver,
                          timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
        """Returns True if the |devserver| is healthy to stage build.

        The outcome is always reported to the devserver_healthy counter,
        together with the reason of a failure.

        @param devserver: url of the devserver.
        @param timeout_min: How long to wait in minutes before deciding the
                            the devserver is not up (float).

        @return: True if devserver is healthy. Return False otherwise.

        """
        c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy')
        reason = ''
        healthy = False
        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
        try:
            if not load:
                # Failed to get the load of devserver.
                reason = '(1) Failed to get load.'
                return False

            apache_ok = cls.is_apache_client_count_ok(load)
            if not apache_ok:
                reason = '(2) Apache client count too high.'
                logging.error('Devserver check_health failed. Live Apache client '
                              'count is too high: %d.',
                              load[cls.APACHE_CLIENT_COUNT])
                return False

            disk_ok = cls.is_free_disk_ok(load)
            if not disk_ok:
                reason = '(3) Disk space too low.'
                logging.error('Devserver check_health failed. Free disk space is '
                              'low. Only %dGB is available.',
                              load[cls.FREE_DISK])
            healthy = bool(disk_ok)
            return disk_ok
        finally:
            resolved_hostname = cls(devserver).resolved_hostname
            c.increment(fields={'dev_server': resolved_hostname,
                                'healthy': healthy,
                                'reason': reason})
            # Monitor how many AU processes the devserver is currently running.
            if load is not None and load.get(DevServer.AU_PROCESS):
                c_au = metrics.Gauge(
                        'chromeos/autotest/devserver/devserver_au_count')
                c_au.set(
                    load.get(DevServer.AU_PROCESS),
                    fields={'dev_server': resolved_hostname})


    @staticmethod
    def _build_call(host, method, **kwargs):
        """Build a URL to |host| that calls |method|, passing |kwargs|.

        Builds a URL that calls |method| on the dev server defined by |host|,
        passing a set of key/value pairs built from the dict |kwargs|.
        Keys and values are inserted as they are, without URL quoting.

        @param host: a string that is the host basename e.g. http://server:90.
        @param method: the dev server method to call.
        @param kwargs: a dict mapping arg names to arg values.
        @return the URL string.
        """
        # If the archive_url is a local path, the args expected by the devserver
        # are a little different.
        archive_url_args = _gs_or_local_archive_url_args(
                kwargs.pop('archive_url', None))
        kwargs.update(archive_url_args)

        # items() rather than iteritems(): available on both Python 2 and 3.
        argstr = '&'.join("%s=%s" % item for item in kwargs.items())
        return "%(host)s/%(method)s?%(argstr)s" % dict(
                host=host, method=method, argstr=argstr)


    def build_call(self, method, **kwargs):
        """Builds a devserver RPC string that is used by 'run_call()'.

        @param method: remote devserver method to call.
        @param kwargs: a dict mapping arg names to arg values.

        @return the URL string.
        """
        return self._build_call(self._devserver, method, **kwargs)


    @classmethod
    def build_all_calls(cls, method, **kwargs):
        """Builds a list of URLs that makes RPC calls on all devservers.

        Build a URL that calls |method| on each healthy dev server, passing a
        set of key/value pairs built from the dict |kwargs|.

        @param method: the dev server method to call.
        @param kwargs: a dict mapping arg names to arg values

        @return the list of URL strings, one per healthy devserver.
        """
        # Note we use cls.servers as servers is class specific.
        return [cls._build_call(server, method, **kwargs)
                for server in cls.servers()
                if cls.devserver_healthy(server)]


    @classmethod
    def run_call(cls, call, readline=False, timeout=None):
        """Invoke a given devserver call using urllib.open.

        Open the URL with HTTP, and return the text of the response. Exceptions
        may be raised as for urllib2.urlopen().

        @param call: a url string that calls a method to a devserver.
        @param readline: whether read http response line by line. Ignored
                         when a timeout is given.
        @param timeout: The timeout seconds for this urlopen call.

        @return the results of this call.
        """
        if timeout is not None:
            return utils.urlopen_socket_timeout(
                    call, timeout=timeout).read()
        elif readline:
            response = urllib2.urlopen(call)
            return [line.rstrip() for line in response]
        else:
            return urllib2.urlopen(call).read()


    @staticmethod
    def servers():
        """Returns a list of servers that can serve as this type of server."""
        raise NotImplementedError()


    @classmethod
    def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT,
                                      unrestricted_only=False):
        """Get the devservers in the same subnet of the given ip.

        @param ip: The IP address of a dut to look for devserver.
        @param mask_bits: Number of mask bits. Default is 19.
        @param unrestricted_only: Set to True to select from devserver in
                unrestricted subnet only. Default is False.

        @return: A list of devservers in the same subnet of the given ip.

        @raise DevServerFailToLocateException: If there is no devserver to
                select from at all.

        """
        # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so
        # we need a dict to return the full devserver path once the IPs are
        # filtered in get_servers_in_same_subnet.
        server_names = {}
        all_devservers = []
        devservers = (cls.get_unrestricted_devservers() if unrestricted_only
                      else cls.servers())
        for server in devservers:
            server_name = get_hostname(server)
            server_names[server_name] = server
            all_devservers.append(server_name)
        if not all_devservers:
            devserver_type = 'unrestricted only' if unrestricted_only else 'all'
            raise DevServerFailToLocateException(
                'Fail to locate a devserver for dut %s in %s devservers'
                % (ip, devserver_type))

        devservers = utils.get_servers_in_same_subnet(ip, mask_bits,
                                                      all_devservers)
        return [server_names[s] for s in devservers]


    @classmethod
    def get_unrestricted_devservers(
                cls, restricted_subnets=utils.RESTRICTED_SUBNETS):
        """Get the devservers not in any restricted subnet specified in
        restricted_subnets.

        NOTE: as a hotfix the subnet filtering is currently disabled and all
        servers are returned; the hotfix counter records how often filtering
        would have applied.

        @param restricted_subnets: A list of restricted subnets.

        @return: A list of devservers (currently all of cls.servers()).

        """
        if not restricted_subnets:
            return cls.servers()

        # The counter was created but never incremented, which records
        # nothing.
        metrics.Counter(
                'chromeos/autotest/devserver/unrestricted_hotfix').increment()
        return cls.servers()


    @classmethod
    def get_healthy_devserver(cls, build, devservers, ban_list=None):
        """"Get a healthy devserver instance from the list of devservers.

        The first candidate is chosen by the hash of `build`; candidates that
        are banned or unhealthy are dropped until a healthy one is found.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
        @param devservers: The devserver list to be chosen out a healthy one.
                           It is not modified.
        @param ban_list: The blacklist of devservers we don't want to choose.
                Default is None.

        @return: A DevServer object of a healthy devserver. Return None if no
                healthy devserver is found.

        """
        logging.debug('Pick one healthy devserver from %r', devservers)
        # Work on a copy: popping from the caller's list emptied it as a side
        # effect, so callers could no longer report what had been tried.
        candidates = list(devservers or [])
        while candidates:
            hash_index = hash(build) % len(candidates)
            devserver = candidates.pop(hash_index)
            logging.debug('Check health for %s', devserver)
            if ban_list and devserver in ban_list:
                continue

            if cls.devserver_healthy(devserver):
                logging.debug('Pick %s', devserver)
                return cls(devserver)

        return None


    @classmethod
    def get_available_devservers(cls, hostname=None,
                                 prefer_local_devserver=PREFER_LOCAL_DEVSERVER,
                                 restricted_subnets=utils.RESTRICTED_SUBNETS):
        """Get devservers in the same subnet of the given hostname.

        @param hostname: Hostname of a DUT to choose devserver for.
        @param prefer_local_devserver: True to prefer devservers in the same
                subnet as the DUT when it is not in a restricted subnet.
        @param restricted_subnets: A list of restricted subnets.

        @return: A tuple of (devservers, can_retry), devservers is a list of
                 devservers that's available for the given hostname. can_retry
                 is a flag that indicate if caller can retry the selection of
                 devserver if no devserver in the returned devservers can be
                 used. For example, if hostname is in a restricted subnet,
                 can_retry will be False.
        """
        logging.info('Getting devservers for host: %s', hostname)
        host_ip = None
        if hostname:
            host_ip = bin_utils.get_ip_address(hostname)
            if not host_ip:
                logging.error('Failed to get IP address of %s. Will pick a '
                              'devserver without subnet constraint.', hostname)

        if not host_ip:
            return cls.get_unrestricted_devservers(restricted_subnets), False

        # Go through all restricted subnet settings and check if the DUT is
        # inside a restricted subnet. If so, only return the devservers in the
        # restricted subnet and doesn't allow retry.
        if restricted_subnets:
            subnet_ip, mask_bits = _get_subnet_for_host_ip(
                    host_ip, restricted_subnets=restricted_subnets)
            if subnet_ip:
                logging.debug('The host %s (%s) is in a restricted subnet. '
                              'Try to locate a devserver inside subnet '
                              '%s:%d.', hostname, host_ip, subnet_ip,
                              mask_bits)
                devservers = cls.get_devservers_in_same_subnet(
                        subnet_ip, mask_bits)
                return devservers, False

        # If prefer_local_devserver is set to True and the host is not in
        # restricted subnet, pick a devserver in the same subnet if possible.
        # Set can_retry to True so it can pick a different devserver if all
        # devservers in the same subnet are down.
        if prefer_local_devserver:
            return (cls.get_devservers_in_same_subnet(
                    host_ip, DEFAULT_SUBNET_MASKBIT, True), True)

        return cls.get_unrestricted_devservers(restricted_subnets), False


    @classmethod
    def resolve(cls, build, hostname=None, ban_list=None):
        """"Resolves a build to a devserver instance.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
        @param hostname: The hostname of dut that requests a devserver. It's
                         used to make sure a devserver in the same subnet is
                         preferred.
        @param ban_list: The blacklist of devservers shouldn't be chosen.

        @return: A DevServer object of a healthy devserver.

        @raise DevServerException: If no devserver is available.
        """
        tried_devservers = set()
        devservers, can_retry = cls.get_available_devservers(hostname)
        if devservers:
            tried_devservers |= set(devservers)

        devserver = cls.get_healthy_devserver(build, devservers,
                                              ban_list=ban_list)

        if not devserver and can_retry:
            # Find available devservers without dut location constrain.
            devservers, _ = cls.get_available_devservers()
            # Record the candidates before probing them, so the error message
            # below lists them even if the probing consumes the list.
            if devservers:
                tried_devservers |= set(devservers)
            devserver = cls.get_healthy_devserver(build, devservers,
                                                  ban_list=ban_list)
        if devserver:
            return devserver

        subnet = 'unrestricted subnet'
        if hostname is not None:
            host_ip = bin_utils.get_ip_address(hostname)
            if host_ip:
                subnet_ip, mask_bits = _get_subnet_for_host_ip(host_ip)
                # No subnet is found for hosts outside of the restricted
                # subnets; do not report that as subnet 'None/None'.
                if subnet_ip:
                    subnet = '%s/%s' % (str(subnet_ip), str(mask_bits))

        error_msg = ('All devservers in subnet: %s are currently down: '
                     '%s. (dut hostname: %s)' %
                     (subnet, tried_devservers, hostname))
        logging.error(error_msg)
        c = metrics.Counter(
                'chromeos/autotest/devserver/subnet_without_devservers')
        c.increment(fields={'subnet': subnet, 'hostname': str(hostname)})
        raise DevServerException(error_msg)


    @classmethod
    def random(cls):
        """Return a random devserver that's available.

        Devserver election in `resolve` method is based on a hash of the
        build that a caller wants to stage. The purpose is that different
        callers requesting for the same build can get the same devserver,
        while the lab is able to distribute different builds across all
        devservers. That helps to reduce the duplication of builds across
        all devservers.
        This function returns a random devserver, by passing a random
        pseudo build name to `resolve `method.
        """
        return cls.resolve(build=str(time.time()))
class CrashServer(DevServer):
    """Class of DevServer that symbolicates crash dumps."""

    @staticmethod
    def servers():
        """Returns the list of configured crash servers."""
        return _get_crash_server_list()


    @remote_devserver_call()
    def symbolicate_dump(self, minidump_path, build):
        """Ask the devserver to symbolicate the dump at minidump_path.

        Stage the debug symbols for |build| and, if that works, ask the
        devserver to symbolicate the dump at |minidump_path|.

        @param minidump_path: the on-disk path of the minidump.
        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                      whose debug symbols are needed for symbolication.
        @return The contents of the stack trace, or '' if the requests
                module is not available.
        @raise DevServerException upon any return code that's not HTTP OK.
        """
        try:
            import requests
        except ImportError:
            logging.warning("Can't 'import requests' to connect to dev server.")
            return ''
        f = {'dev_server': self.resolved_hostname}
        c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
        c.increment(fields=f)
        # Symbolicate minidump.
        m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
        with metrics.SecondsTimer(m, fields=f):
            call = self.build_call('symbolicate_dump',
                                   archive_url=_get_image_storage_server() + build)
            # Close the minidump once it is uploaded instead of leaking the
            # file handle.
            with open(minidump_path, 'rb') as minidump:
                request = requests.post(call, files={'minidump': minidump})
            if request.status_code == requests.codes.OK:
                return request.text

        # Raised as an HTTPError so that the remote_devserver_call decorator
        # converts it into a DevServerException.
        error_fd = cStringIO.StringIO(request.text)
        raise urllib2.HTTPError(
                call, request.status_code, request.text, request.headers,
                error_fd)


    @classmethod
    def get_available_devservers(cls, hostname=None):
        """Get all available crash servers.

        Crash server election doesn't need to count the location of hostname.

        @param hostname: Hostname of a DUT to choose devserver for. Unused;
                         optional like in DevServer.get_available_devservers.

        @return: A tuple of (all crash servers, False). can_retry is set to
                 False, as all crash servers are returned. There is no point to
                 retry.
        """
        return cls.servers(), False
970 """ 971 972 @classmethod 973 def servers(cls): 974 """Returns a list of servers that can serve as a desired type of 975 devserver. 976 """ 977 return _get_dev_server_list() 978 979 980 def _get_image_url(self, image): 981 """Returns the url of the directory for this image on the devserver. 982 983 @param image: the image that was fetched. 984 """ 985 image = self.translate(image) 986 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 987 type=str) 988 return (url_pattern % (self.url(), image)).replace('update', 'static') 989 990 991 @staticmethod 992 def create_metadata(server_name, image, artifacts=None, files=None): 993 """Create a metadata dictionary given the staged items. 994 995 The metadata can be send to metadata db along with stats. 996 997 @param server_name: name of the devserver, e.g 172.22.33.44. 998 @param image: The name of the image. 999 @param artifacts: A list of artifacts. 1000 @param files: A list of files. 1001 1002 @return A metadata dictionary. 1003 1004 """ 1005 metadata = {'devserver': server_name, 1006 'image': image, 1007 '_type': 'devserver'} 1008 if artifacts: 1009 metadata['artifacts'] = ' '.join(artifacts) 1010 if files: 1011 metadata['files'] = ' '.join(files) 1012 return metadata 1013 1014 1015 @classmethod 1016 def run_ssh_call(cls, call, readline=False, timeout=None): 1017 """Construct an ssh-based rpc call, and execute it. 1018 1019 @param call: a url string that calls a method to a devserver. 1020 @param readline: whether read http response line by line. 1021 @param timeout: The timeout seconds for ssh call. 1022 1023 @return the results of this call. 
        """
        hostname = get_hostname(call)
        ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call))
        # A falsy timeout (None or 0) falls back to the default ssh timeout.
        timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60
        try:
            result = utils.run(ssh_call, timeout=timeout_seconds)
        except error.CmdError as e:
            logging.debug('Error occurred with exit_code %d when executing the '
                          'ssh call: %s.', e.result_obj.exit_status,
                          e.result_obj.stderr)
            # Count the ssh failure per devserver, then let the caller see the
            # original CmdError.
            c = metrics.Counter('chromeos/autotest/devserver/ssh_failure')
            c.increment(fields={'dev_server': hostname})
            raise
        response = result.stdout

        # If the curl command's returned HTTP response contains certain
        # exception string, raise the DevServerException of the response.
        if 'DownloaderException' in response:
            raise DevServerException(_strip_http_message(response))

        if readline:
            # Remove line terminators and trailing whitespace
            response = response.splitlines()
            return [line.rstrip() for line in response]

        return response


    @classmethod
    def run_call(cls, call, readline=False, timeout=None):
        """Invoke a given devserver call using urllib.open or ssh.

        Open the URL with HTTP or SSH-based HTTP, and return the text of the
        response. Exceptions may be raised as for urllib2.urlopen() or
        utils.run().

        The call is polled every 5 seconds for up to 60 seconds while the
        response contains ERR_MSG_FOR_DOWN_DEVSERVER.

        @param call: a url string that calls a method to a devserver.
        @param readline: whether read http response line by line.
        @param timeout: The timeout seconds for urlopen call or ssh call.

        @return the results of this call, or ERR_MSG_FOR_DOWN_DEVSERVER if
                polling ends with bin_utils.TimeoutError.
        """
        server_name = get_hostname(call)
        is_in_restricted_subnet = utils.get_restricted_subnet(
                server_name, utils.RESTRICTED_SUBNETS)
        # Unique truthy stand-in for an empty response, see kickoff_call.
        _EMPTY_SENTINEL_VALUE = object()
        def kickoff_call():
            """Make one attempt at the devserver call over HTTP or ssh.

            Takes no arguments: `call`, `readline`, `timeout` and
            `is_in_restricted_subnet` are read from the enclosing run_call
            scope. ssh is used only when ENABLE_SSH_CONNECTION_FOR_DEVSERVER
            is set and the devserver is in a restricted subnet.

            @return False if the response contains ERR_MSG_FOR_DOWN_DEVSERVER,
                    _EMPTY_SENTINEL_VALUE if the response is empty, otherwise
                    the response itself.
            """
            if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or
                not is_in_restricted_subnet):
                response = super(ImageServerBase, cls).run_call(
                        call, readline=readline, timeout=timeout)
            else:
                response = cls.run_ssh_call(
                        call, readline=readline, timeout=timeout)
            # Retry if devserver service is temporarily down, e.g. in a
            # devserver push.
            if ERR_MSG_FOR_DOWN_DEVSERVER in response:
                return False

            # Don't return response directly since it may be empty string,
            # which causes poll_for_condition to retry.
            return _EMPTY_SENTINEL_VALUE if not response else response

        try:
            response = bin_utils.poll_for_condition(
                    kickoff_call,
                    exception=bin_utils.TimeoutError(),
                    timeout=60,
                    sleep_interval=5)
            # Map the sentinel back to the empty string callers expect.
            return '' if response is _EMPTY_SENTINEL_VALUE else response
        except bin_utils.TimeoutError:
            return ERR_MSG_FOR_DOWN_DEVSERVER


    @classmethod
    def download_file(cls, remote_file, local_file, timeout=None):
        """Download file from devserver.

        The format of remote_file should be:
            http://devserver_ip:8082/static/board/...

        The response of run_call is written to local_file as-is, in text
        mode ('w').

        @param remote_file: The URL of the file on devserver that need to be
                            downloaded.
        @param local_file: The path of the file saved to local.
        @param timeout: The timeout seconds for this call.
        """
        response = cls.run_call(remote_file, timeout=timeout)
        with open(local_file, 'w') as out_log:
            out_log.write(response)


    def _poll_is_staged(self, **kwargs):
        """Polling devserver.is_staged until all artifacts are staged.

        @param kwargs: keyword arguments to make is_staged devserver call.

        @return: True if all artifacts are staged in devserver.
        """
        call = self.build_call('is_staged', **kwargs)

        def all_staged():
            """Call devserver.is_staged rpc to check if all files are staged.

            @return: True if all artifacts are staged in devserver. False
                     otherwise.
            @raises DevServerException, the exception is a wrapper of all
                    exceptions that were raised when devserver tried to download
                    the artifacts. devserver raises an HTTPError or a CmdError
                    when an exception was raised in the code. Such exception
                    should be re-raised here to stop the caller from waiting.
                    If the call to devserver failed for connection issue, a
                    URLError exception is raised, and caller should retry the
                    call to avoid such network flakiness.

            """
            try:
                result = self.run_call(call)
                logging.debug('whether artifact is staged: %r', result)
                # The rpc answers with the literal string 'True' when done.
                return result == 'True'
            except urllib2.HTTPError as e:
                # HTTPError is handled before URLError so that a devserver
                # side failure stops the polling instead of being retried.
                error_markup = e.read()
                raise DevServerException(_strip_http_message(error_markup))
            except urllib2.URLError as e:
                # Could be connection issue, retry it.
                # For example: <urlopen error [Errno 111] Connection refused>
                logging.error('URLError happens in is_stage: %r', e)
                return False
            except error.CmdError as e:
                # Retry if SSH failed to connect to the devserver.
                logging.warning('CmdError happens in is_stage: %r, will retry', e)
                return False

        # Poll every _ARTIFACT_STAGE_POLLING_INTERVAL seconds for at most
        # DEVSERVER_IS_STAGING_RETRY_MIN minutes.
        bin_utils.poll_for_condition(
                all_staged,
                exception=bin_utils.TimeoutError(),
                timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60,
                sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL)

        return True


    def _call_and_wait(self, call_name, error_message,
                       expected_response=SUCCESS, **kwargs):
        """Helper method to make a urlopen call, and wait for artifacts staged.

        @param call_name: name of devserver rpc call.
        @param error_message: Error message to be thrown if response does not
                              match expected_response.
        @param expected_response: Expected response from rpc, default to
                                  |Success|. If it's set to None, do not compare
                                  the actual response. Any response is consider
                                  to be good.
        @param kwargs: keyword arguments to make is_staged devserver call.

        @return: The response from rpc.
        @raise DevServerException upon any response that's not
               expected_response, or when a BadStatusLine is received.
        @raise DevServerOverloadException if the response contains
               ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE (raised after a 30
               second sleep).

        """
        call = self.build_call(call_name, async=True, **kwargs)
        try:
            response = self.run_call(call)
            logging.debug('response for RPC: %r', response)
            if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response:
                logging.debug('Proxy error happens in RPC call, '
                              'will retry in 30 seconds')
                # The sleep happens here; the retry itself is presumably
                # driven by whoever catches DevServerOverloadException --
                # TODO confirm against the remote_devserver_call decorator.
                time.sleep(30)
                raise DevServerOverloadException()
        except httplib.BadStatusLine as e:
            logging.error(e)
            raise DevServerException('Received Bad Status line, Devserver %s '
                                     'might have gone down while handling '
                                     'the call: %s' % (self.url(), call))

        # A falsy expected_response (e.g. None) disables the comparison.
        if expected_response and not response == expected_response:
                raise DevServerException(error_message)

        # `os_type` is needed in build a devserver call, but not needed for
        # wait_for_artifacts_staged, since that method is implemented by
        # each ImageServerBase child class.
        if 'os_type' in kwargs:
            del kwargs['os_type']
        self.wait_for_artifacts_staged(**kwargs)
        return response


    def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
        """Tell the devserver to download and stage |artifacts| from |image|
        specified by kwargs.

        This is the main call point for staging any specific artifacts for a
        given build. To see the list of artifacts one can stage see:

        ~src/platform/dev/artifact_info.py.

        This is maintained along with the actual devserver code.

        @param build: Name of the build; appended to the storage server URL
                      when archive_url is not given.
        @param artifacts: A list of artifacts.
        @param files: A list of files to stage.
        @param archive_url: Optional parameter that has the archive_url to stage
                this artifact from. Default is specified in autotest config +
                image.
        @param kwargs: keyword arguments that specify the build information, to
                make stage devserver call.

        @raise DevServerException upon any return code that's not HTTP OK, or
               when staging times out.
        """
        if not archive_url:
            archive_url = _get_storage_server_for_artifacts(artifacts) + build

        # The rpc takes comma separated strings rather than lists.
        artifacts_arg = ','.join(artifacts) if artifacts else ''
        files_arg = ','.join(files) if files else ''
        error_message = ("staging %s for %s failed;"
                         "HTTP OK not accompanied by 'Success'." %
                         ('artifacts=%s files=%s ' % (artifacts_arg, files_arg),
                          build))

        staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
                        (build, artifacts, files, archive_url))
        logging.info('Staging artifacts on devserver %s: %s',
                     self.url(), staging_info)
        # Flipped to True only after call_and_wait returns; reported by the
        # counter in the finally clause on every exit path.
        success = False
        try:
            arguments = {'archive_url': archive_url,
                         'artifacts': artifacts_arg,
                         'files': files_arg}
            if kwargs:
                arguments.update(kwargs)
            f = {'artifacts': artifacts_arg,
                 'dev_server': self.resolved_hostname}
            with metrics.SecondsTimer(
                    'chromeos/autotest/devserver/stage_artifact_duration',
                    fields=f):
                self.call_and_wait(call_name='stage', error_message=error_message,
                                   **arguments)
            logging.info('Finished staging artifacts: %s', staging_info)
            success = True
        except (bin_utils.TimeoutError, error.TimeoutException):
            logging.error('stage_artifacts timed out: %s', staging_info)
            raise DevServerException(
                    'stage_artifacts timed out: %s' % staging_info)
        finally:
            f = {'success': success,
                 'artifacts': artifacts_arg,
                 'dev_server': self.resolved_hostname}
            metrics.Counter('chromeos/autotest/devserver/stage_artifact'
                            ).increment(fields=f)


    def call_and_wait(self, *args, **kwargs):
        """Helper method to make a urlopen call, and wait for artifacts staged.

        This method needs to be overridden in the subclass to implement the
        logic to call _call_and_wait.

        @raise NotImplementedError always, in this base class.
        """
        raise NotImplementedError


    def _trigger_download(self, build, artifacts, files, synchronous=True,
                          **kwargs_build_info):
        """Tell the devserver to download and stage image specified in
        kwargs_build_info.

        Tells the devserver to fetch |image| from the image storage server
        named by _get_image_storage_server().

        If |synchronous| is True, waits for the entire download to finish
        staging before returning. Otherwise only the artifacts necessary
        to start installing images onto DUT's will be staged before returning.
        A caller can then call finish_download to guarantee the rest of the
        artifacts have finished staging.

        @param build: Name of the build; appended to the image storage server
                      URL to form archive_url when kwargs_build_info is empty.
        @param artifacts: Artifacts to stage, passed through to the `stage`
                          call.
        @param files: Files to stage, passed through to the `stage` call.
        @param synchronous: if True, waits until all components of the image are
               staged before returning.
        @param kwargs_build_info: Dictionary of build information.
                For CrOS, it is None as build is the CrOS image name.
                For Android, it is {'target': target,
                                    'build_id': build_id,
                                    'branch': branch}

        @raise DevServerException upon any return code that's not HTTP OK.

        """
        # With explicit build info no archive_url is sent.
        if kwargs_build_info:
            archive_url = None
        else:
            archive_url = _get_image_storage_server() + build
        error_message = ("trigger_download for %s failed;"
                         "HTTP OK not accompanied by 'Success'." % build)
        kwargs = {'archive_url': archive_url,
                  'artifacts': artifacts,
                  'files': files,
                  'error_message': error_message}
        if kwargs_build_info:
            kwargs.update(kwargs_build_info)

        logging.info('trigger_download starts for %s', build)
        try:
            response = self.call_and_wait(call_name='stage', **kwargs)
            logging.info('trigger_download finishes for %s', build)
        except (bin_utils.TimeoutError, error.TimeoutException):
            logging.error('trigger_download timed out for %s.', build)
            raise DevServerException(
                    'trigger_download timed out for %s.' % build)
        was_successful = response == SUCCESS
        if was_successful and synchronous:
            self._finish_download(build, artifacts, files, **kwargs_build_info)


    def _finish_download(self, build, artifacts, files, **kwargs_build_info):
        """Tell the devserver to finish staging image specified in
        kwargs_build_info.

        If trigger_download is called with synchronous=False, it will return
        before all artifacts have been staged. This method contacts the
        devserver and blocks until all staging is completed and should be
        called after a call to trigger_download.

        @param build: Name of the build; appended to the image storage server
                      URL to form archive_url.
        @param artifacts: Artifacts to stage, passed through to the `stage`
                          call.
        @param files: Files to stage, passed through to the `stage` call.
        @param kwargs_build_info: Dictionary of build information.
                For CrOS, it is None as build is the CrOS image name.
                For Android, it is {'target': target,
                                    'build_id': build_id,
                                    'branch': branch}

        @raise DevServerException upon any return code that's not HTTP OK.
        """
        # NOTE(review): unlike _trigger_download, archive_url is built from
        # |build| even when kwargs_build_info is given -- confirm intended.
        archive_url = _get_image_storage_server() + build
        error_message = ("finish_download for %s failed;"
                         "HTTP OK not accompanied by 'Success'." % build)
        kwargs = {'archive_url': archive_url,
                  'artifacts': artifacts,
                  'files': files,
                  'error_message': error_message}
        if kwargs_build_info:
            kwargs.update(kwargs_build_info)
        try:
            self.call_and_wait(call_name='stage', **kwargs)
        except (bin_utils.TimeoutError, error.TimeoutException):
            logging.error('finish_download timed out for %s', build)
            raise DevServerException(
                    'finish_download timed out for %s.' % build)


    @remote_devserver_call()
    def locate_file(self, file_name, artifacts, build, build_info):
        """Locate a file with the given file_name on devserver.

        This method calls devserver RPC `locate_file` to look up a file with
        the given file name inside specified build artifacts.

        @param file_name: Name of the file to look for a file.
        @param artifacts: A list of artifact names to search for the file.
        @param build: Name of the build. For Android, it's None as build_info
                should be used.
        @param build_info: Dictionary of build information.
                For CrOS, it is None as build is the CrOS image name.
                For Android, it is {'target': target,
                                    'build_id': build_id,
                                    'branch': branch}

        @return: A devserver url to the file.
        @raise DevServerException upon any return code that's not HTTP OK, or
               when neither build nor build_info is given.
        """
        if not build and not build_info:
            raise DevServerException('You must specify build information to '
                                     'look for file %s in artifacts %s.' %
                                     (file_name, artifacts))
        kwargs = {'file_name': file_name,
                  'artifacts': artifacts}
        # build_info takes precedence over build when both are supplied.
        if build_info:
            build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info
            kwargs.update(build_info)
            # Devserver treats Android and Brillo build in the same way as they
            # are both retrieved from Launch Control and have similar build
            # artifacts. Therefore, os_type for devserver calls is `android` for
            # both Android and Brillo builds.
            kwargs['os_type'] = 'android'
        else:
            build_path = build
            kwargs['build'] = build
        call = self.build_call('locate_file', async=False, **kwargs)
        try:
            file_path = self.run_call(call)
            return os.path.join(self.url(), 'static', build_path, file_path)
        except httplib.BadStatusLine as e:
            logging.error(e)
            raise DevServerException('Received Bad Status line, Devserver %s '
                                     'might have gone down while handling '
                                     'the call: %s' % (self.url(), call))


    @remote_devserver_call()
    def list_control_files(self, build, suite_name=''):
        """Ask the devserver to list all control files for |build|.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                      whose control files the caller wants listed.
        @param suite_name: The name of the suite for which we require control
                           files.
        @return None on failure, or a list of control file paths
                (e.g. server/site_tests/autoupdate/control)
        @raise DevServerException upon any return code that's not HTTP OK.
        """
        build = self.translate(build)
        call = self.build_call('controlfiles', build=build,
                               suite_name=suite_name)
        return self.run_call(call, readline=True)


    @remote_devserver_call()
    def get_control_file(self, build, control_path):
        """Ask the devserver for the contents of a control file.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                      whose control file the caller wants to fetch.
        @param control_path: The file to fetch
                             (e.g. server/site_tests/autoupdate/control)
        @return The contents of the desired file.
        @raise DevServerException upon any return code that's not HTTP OK.
1449 """ 1450 build = self.translate(build) 1451 call = self.build_call('controlfiles', build=build, 1452 control_path=control_path) 1453 return self.run_call(call) 1454 1455 1456 @remote_devserver_call() 1457 def list_suite_controls(self, build, suite_name=''): 1458 """Ask the devserver to list contents of all control files for |build|. 1459 1460 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514) 1461 whose control files' contents the caller wants returned. 1462 @param suite_name: The name of the suite for which we require control 1463 files. 1464 @return None on failure, or a dict of contents of all control files 1465 (e.g. {'path1': "#Copyright controls ***", ..., 1466 pathX': "#Copyright controls ***"} 1467 @raise DevServerException upon any return code that's not HTTP OK. 1468 """ 1469 build = self.translate(build) 1470 call = self.build_call('list_suite_controls', build=build, 1471 suite_name=suite_name) 1472 return json.load(cStringIO.StringIO(self.run_call(call))) 1473 1474 1475class ImageServer(ImageServerBase): 1476 """Class for DevServer that handles RPCs related to CrOS images. 1477 1478 The calls to devserver to stage artifacts, including stage and download, are 1479 made in async mode. That is, when caller makes an RPC |stage| to request 1480 devserver to stage certain artifacts, devserver handles the call and starts 1481 staging artifacts in a new thread, and return |Success| without waiting for 1482 staging being completed. When caller receives message |Success|, it polls 1483 devserver's is_staged call until all artifacts are staged. 1484 Such mechanism is designed to prevent cherrypy threads in devserver being 1485 running out, as staging artifacts might take long time, and cherrypy starts 1486 with a fixed number of threads that handle devserver rpc. 1487 """ 1488 1489 class ArtifactUrls(object): 1490 """A container for URLs of staged artifacts. 
1491 1492 Attributes: 1493 full_payload: URL for downloading a staged full release update 1494 mton_payload: URL for downloading a staged M-to-N release update 1495 nton_payload: URL for downloading a staged N-to-N release update 1496 1497 """ 1498 def __init__(self, full_payload=None, mton_payload=None, 1499 nton_payload=None): 1500 self.full_payload = full_payload 1501 self.mton_payload = mton_payload 1502 self.nton_payload = nton_payload 1503 1504 1505 def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''): 1506 """Polling devserver.is_staged until all artifacts are staged. 1507 1508 @param archive_url: Google Storage URL for the build. 1509 @param artifacts: Comma separated list of artifacts to download. 1510 @param files: Comma separated list of files to download. 1511 @return: True if all artifacts are staged in devserver. 1512 """ 1513 kwargs = {'archive_url': archive_url, 1514 'artifacts': artifacts, 1515 'files': files} 1516 return self._poll_is_staged(**kwargs) 1517 1518 1519 @remote_devserver_call() 1520 def call_and_wait(self, call_name, archive_url, artifacts, files, 1521 error_message, expected_response=SUCCESS): 1522 """Helper method to make a urlopen call, and wait for artifacts staged. 1523 1524 @param call_name: name of devserver rpc call. 1525 @param archive_url: Google Storage URL for the build.. 1526 @param artifacts: Comma separated list of artifacts to download. 1527 @param files: Comma separated list of files to download. 1528 @param expected_response: Expected response from rpc, default to 1529 |Success|. If it's set to None, do not compare 1530 the actual response. Any response is consider 1531 to be good. 1532 @param error_message: Error message to be thrown if response does not 1533 match expected_response. 1534 1535 @return: The response from rpc. 1536 @raise DevServerException upon any return code that's expected_response. 
1537 1538 """ 1539 kwargs = {'archive_url': archive_url, 1540 'artifacts': artifacts, 1541 'files': files} 1542 return self._call_and_wait(call_name, error_message, 1543 expected_response, **kwargs) 1544 1545 1546 @remote_devserver_call() 1547 def stage_artifacts(self, image=None, artifacts=None, files='', 1548 archive_url=None): 1549 """Tell the devserver to download and stage |artifacts| from |image|. 1550 1551 This is the main call point for staging any specific artifacts for a 1552 given build. To see the list of artifacts one can stage see: 1553 1554 ~src/platfrom/dev/artifact_info.py. 1555 1556 This is maintained along with the actual devserver code. 1557 1558 @param image: the image to fetch and stage. 1559 @param artifacts: A list of artifacts. 1560 @param files: A list of files to stage. 1561 @param archive_url: Optional parameter that has the archive_url to stage 1562 this artifact from. Default is specified in autotest config + 1563 image. 1564 1565 @raise DevServerException upon any return code that's not HTTP OK. 1566 """ 1567 if not artifacts and not files: 1568 raise DevServerException('Must specify something to stage.') 1569 image = self.translate(image) 1570 self._stage_artifacts(image, artifacts, files, archive_url) 1571 1572 1573 @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS) 1574 def list_image_dir(self, image): 1575 """List the contents of the image stage directory, on the devserver. 1576 1577 @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>. 1578 1579 @raise DevServerException upon any return code that's not HTTP OK. 
1580 """ 1581 image = self.translate(image) 1582 logging.info('Requesting contents from devserver %s for image %s', 1583 self.url(), image) 1584 archive_url = _get_storage_server_for_artifacts() + image 1585 call = self.build_call('list_image_dir', archive_url=archive_url) 1586 response = self.run_call(call, readline=True) 1587 for line in response: 1588 logging.info(line) 1589 1590 1591 def trigger_download(self, image, synchronous=True): 1592 """Tell the devserver to download and stage |image|. 1593 1594 Tells the devserver to fetch |image| from the image storage server 1595 named by _get_image_storage_server(). 1596 1597 If |synchronous| is True, waits for the entire download to finish 1598 staging before returning. Otherwise only the artifacts necessary 1599 to start installing images onto DUT's will be staged before returning. 1600 A caller can then call finish_download to guarantee the rest of the 1601 artifacts have finished staging. 1602 1603 @param image: the image to fetch and stage. 1604 @param synchronous: if True, waits until all components of the image are 1605 staged before returning. 1606 1607 @raise DevServerException upon any return code that's not HTTP OK. 1608 1609 """ 1610 image = self.translate(image) 1611 artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE 1612 self._trigger_download(image, artifacts, files='', 1613 synchronous=synchronous) 1614 1615 1616 @remote_devserver_call() 1617 def setup_telemetry(self, build): 1618 """Tell the devserver to setup telemetry for this build. 1619 1620 The devserver will stage autotest and then extract the required files 1621 for telemetry. 1622 1623 @param build: the build to setup telemetry for. 1624 1625 @returns path on the devserver that telemetry is installed to. 
1626 """ 1627 build = self.translate(build) 1628 archive_url = _get_image_storage_server() + build 1629 call = self.build_call('setup_telemetry', archive_url=archive_url) 1630 try: 1631 response = self.run_call(call) 1632 except httplib.BadStatusLine as e: 1633 logging.error(e) 1634 raise DevServerException('Received Bad Status line, Devserver %s ' 1635 'might have gone down while handling ' 1636 'the call: %s' % (self.url(), call)) 1637 return response 1638 1639 1640 def finish_download(self, image): 1641 """Tell the devserver to finish staging |image|. 1642 1643 If trigger_download is called with synchronous=False, it will return 1644 before all artifacts have been staged. This method contacts the 1645 devserver and blocks until all staging is completed and should be 1646 called after a call to trigger_download. 1647 1648 @param image: the image to fetch and stage. 1649 @raise DevServerException upon any return code that's not HTTP OK. 1650 """ 1651 image = self.translate(image) 1652 artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST 1653 self._finish_download(image, artifacts, files='') 1654 1655 1656 def get_update_url(self, image): 1657 """Returns the url that should be passed to the updater. 1658 1659 @param image: the image that was fetched. 1660 """ 1661 image = self.translate(image) 1662 url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern', 1663 type=str) 1664 return (url_pattern % (self.url(), image)) 1665 1666 1667 def get_staged_file_url(self, filename, image): 1668 """Returns the url of a staged file for this image on the devserver.""" 1669 return '/'.join([self._get_image_url(image), filename]) 1670 1671 1672 def get_full_payload_url(self, image): 1673 """Returns a URL to a staged full payload. 1674 1675 @param image: the image that was fetched. 1676 1677 @return A fully qualified URL that can be used for downloading the 1678 payload. 
1679 1680 """ 1681 return self._get_image_url(image) + '/update.gz' 1682 1683 1684 def get_test_image_url(self, image): 1685 """Returns a URL to a staged test image. 1686 1687 @param image: the image that was fetched. 1688 1689 @return A fully qualified URL that can be used for downloading the 1690 image. 1691 1692 """ 1693 return self._get_image_url(image) + '/chromiumos_test_image.bin' 1694 1695 1696 def get_recovery_image_url(self, image): 1697 """Returns a URL to a staged recovery image. 1698 1699 @param image: the image that was fetched. 1700 1701 @return A fully qualified URL that can be used for downloading the 1702 image. 1703 1704 """ 1705 return self._get_image_url(image) + '/recovery_image.bin' 1706 1707 1708 @remote_devserver_call() 1709 def get_dependencies_file(self, build): 1710 """Ask the dev server for the contents of the suite dependencies file. 1711 1712 Ask the dev server at |self._dev_server| for the contents of the 1713 pre-processed suite dependencies file (at DEPENDENCIES_FILE) 1714 for |build|. 1715 1716 @param build: The build (e.g. x86-mario-release/R21-2333.0.0) 1717 whose dependencies the caller is interested in. 1718 @return The contents of the dependencies file, which should eval to 1719 a dict of dicts, as per bin_utils/suite_preprocessor.py. 1720 @raise DevServerException upon any return code that's not HTTP OK. 1721 """ 1722 build = self.translate(build) 1723 call = self.build_call('controlfiles', 1724 build=build, control_path=DEPENDENCIES_FILE) 1725 return self.run_call(call) 1726 1727 1728 @remote_devserver_call() 1729 def get_latest_build_in_gs(self, board): 1730 """Ask the devservers for the latest offical build in Google Storage. 1731 1732 @param board: The board for who we want the latest official build. 1733 @return A string of the returned build rambi-release/R37-5868.0.0 1734 @raise DevServerException upon any return code that's not HTTP OK. 
1735 """ 1736 call = self.build_call( 1737 'xbuddy_translate/remote/%s/latest-official' % board, 1738 image_dir=_get_image_storage_server()) 1739 image_name = self.run_call(call) 1740 return os.path.dirname(image_name) 1741 1742 1743 def translate(self, build_name): 1744 """Translate the build name if it's in LATEST format. 1745 1746 If the build name is in the format [builder]/LATEST, return the latest 1747 build in Google Storage otherwise return the build name as is. 1748 1749 @param build_name: build_name to check. 1750 1751 @return The actual build name to use. 1752 """ 1753 match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I) 1754 if not match: 1755 return build_name 1756 translated_build = self.get_latest_build_in_gs(match.groups()[0]) 1757 logging.debug('Translated relative build %s to %s', build_name, 1758 translated_build) 1759 return translated_build 1760 1761 1762 @classmethod 1763 @remote_devserver_call() 1764 def get_latest_build(cls, target, milestone=''): 1765 """Ask all the devservers for the latest build for a given target. 1766 1767 @param target: The build target, typically a combination of the board 1768 and the type of build e.g. x86-mario-release. 1769 @param milestone: For latest build set to '', for builds only in a 1770 specific milestone set to a str of format Rxx 1771 (e.g. R16). Default: ''. Since we are dealing with a 1772 webserver sending an empty string, '', ensures that 1773 the variable in the URL is ignored as if it was set 1774 to None. 1775 @return A string of the returned build e.g. R20-2226.0.0. 1776 @raise DevServerException upon any return code that's not HTTP OK. 
1777 """ 1778 calls = cls.build_all_calls('latestbuild', target=target, 1779 milestone=milestone) 1780 latest_builds = [] 1781 for call in calls: 1782 latest_builds.append(cls.run_call(call)) 1783 1784 return max(latest_builds, key=version.LooseVersion) 1785 1786 1787 @remote_devserver_call() 1788 def _kill_au_process_for_host(self, **kwargs): 1789 """Kill the triggerred auto_update process if error happens in cros_au. 1790 1791 @param kwargs: Arguments to make kill_au_proc devserver call. 1792 """ 1793 call = self.build_call('kill_au_proc', **kwargs) 1794 response = self.run_call(call) 1795 if not response == 'True': 1796 raise DevServerException( 1797 'Failed to kill the triggerred CrOS auto_update process' 1798 'on devserver %s, the response is %s' % ( 1799 self.url(), response)) 1800 1801 1802 def kill_au_process_for_host(self, host_name, pid): 1803 """Kill the triggerred auto_update process if error happens. 1804 1805 Usually this function is used to clear all potential left au processes 1806 of the given host name. 1807 1808 If pid is specified, the devserver will further check the given pid to 1809 make sure the process is killed. This is used for the case that the au 1810 process has started in background, but then provision fails due to 1811 some unknown issues very fast. In this case, when 'kill_au_proc' is 1812 called, there's no corresponding background track log created for this 1813 ongoing au process, which prevents this RPC call from killing this au 1814 process. 1815 1816 @param host_name: The DUT's hostname. 1817 @param pid: The ongoing au process's pid. 1818 1819 @return: True if successfully kill the auto-update process for host. 
1820 """ 1821 kwargs = {'host_name': host_name, 'pid': pid} 1822 try: 1823 self._kill_au_process_for_host(**kwargs) 1824 except DevServerException: 1825 return False 1826 1827 return True 1828 1829 1830 @remote_devserver_call() 1831 def _clean_track_log(self, **kwargs): 1832 """Clean track log for the current auto-update process.""" 1833 call = self.build_call('handler_cleanup', **kwargs) 1834 self.run_call(call) 1835 1836 1837 def clean_track_log(self, host_name, pid): 1838 """Clean track log for the current auto-update process. 1839 1840 @param host_name: The host name to be updated. 1841 @param pid: The auto-update process id. 1842 1843 @return: True if track log is successfully cleaned, False otherwise. 1844 """ 1845 if not pid: 1846 return False 1847 1848 kwargs = {'host_name': host_name, 'pid': pid} 1849 try: 1850 self._clean_track_log(**kwargs) 1851 except DevServerException as e: 1852 logging.debug('Failed to clean track_status_file on ' 1853 'devserver for host %s and process id %s: %s', 1854 host_name, pid, str(e)) 1855 return False 1856 1857 return True 1858 1859 1860 def _get_au_log_filename(self, log_dir, host_name, pid): 1861 """Return the auto-update log's filename.""" 1862 return os.path.join(log_dir, CROS_AU_LOG_FILENAME % ( 1863 host_name, pid)) 1864 1865 def _read_json_response_from_devserver(self, response): 1866 """Reads the json response from the devserver. 1867 1868 This is extracted to its own function so that it can be easily mocked. 1869 @param response: the response for a devserver. 1870 """ 1871 try: 1872 return json.loads(response) 1873 except ValueError as e: 1874 logging.debug('Failed to load json response: %s', response) 1875 raise DevServerException(e) 1876 1877 1878 @remote_devserver_call() 1879 def _collect_au_log(self, log_dir, **kwargs): 1880 """Collect logs from devserver after cros-update process is finished. 1881 1882 Collect the logs that recording the whole cros-update process, and 1883 write it to sysinfo path of a job. 
1884 1885 The example log file name that is stored is like: 1886 '1220-repair/sysinfo/CrOS_update_host_name_pid.log' 1887 1888 @param host_name: the DUT's hostname. 1889 @param pid: the auto-update process id on devserver. 1890 @param log_dir: The directory to save the cros-update process log 1891 retrieved from devserver. 1892 """ 1893 call = self.build_call('collect_cros_au_log', **kwargs) 1894 response = self.run_call(call) 1895 if not os.path.exists(log_dir): 1896 os.mkdir(log_dir) 1897 write_file = self._get_au_log_filename( 1898 log_dir, kwargs['host_name'], kwargs['pid']) 1899 logging.debug('Saving auto-update logs into %s', write_file) 1900 1901 au_logs = self._read_json_response_from_devserver(response) 1902 1903 try: 1904 for k, v in au_logs['host_logs'].items(): 1905 log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid']) 1906 log_path = os.path.join(log_dir, log_name) 1907 with open(log_path, 'w') as out_log: 1908 out_log.write(v) 1909 except IOError as e: 1910 raise DevServerException('Failed to write auto-update hostlogs: ' 1911 '%s' % e) 1912 1913 try: 1914 with open(write_file, 'w') as out_log: 1915 out_log.write(au_logs['cros_au_log']) 1916 except: 1917 raise DevServerException('Failed to write auto-update logs into ' 1918 '%s' % write_file) 1919 1920 1921 def collect_au_log(self, host_name, pid, log_dir): 1922 """Collect logs from devserver after cros-update process is finished. 1923 1924 @param host_name: the DUT's hostname. 1925 @param pid: the auto-update process id on devserver. 1926 @param log_dir: The directory to save the cros-update process log 1927 retrieved from devserver. 1928 1929 @return: True if auto-update log is successfully collected, False 1930 otherwise. 
1931 """ 1932 if not pid: 1933 return False 1934 1935 kwargs = {'host_name': host_name, 'pid': pid} 1936 try: 1937 self._collect_au_log(log_dir, **kwargs) 1938 except DevServerException as e: 1939 logging.debug('Failed to collect auto-update log on ' 1940 'devserver for host %s and process id %s: %s', 1941 host_name, pid, str(e)) 1942 return False 1943 1944 return True 1945 1946 1947 @remote_devserver_call() 1948 def _trigger_auto_update(self, **kwargs): 1949 """Trigger auto-update by calling devserver.cros_au. 1950 1951 @param kwargs: Arguments to make cros_au devserver call. 1952 1953 @return: a tuple indicates whether the RPC call cros_au succeeds and 1954 the auto-update process id running on devserver. 1955 """ 1956 host_name = kwargs['host_name'] 1957 call = self.build_call('cros_au', async=True, **kwargs) 1958 try: 1959 response = self.run_call(call) 1960 logging.info( 1961 'Received response from devserver for cros_au call: %r', 1962 response) 1963 except httplib.BadStatusLine as e: 1964 logging.error(e) 1965 raise DevServerException('Received Bad Status line, Devserver %s ' 1966 'might have gone down while handling ' 1967 'the call: %s' % (self.url(), call)) 1968 1969 return response 1970 1971 1972 def _check_for_auto_update_finished(self, pid, wait=True, **kwargs): 1973 """Polling devserver.get_au_status to get current auto-update status. 1974 1975 The current auto-update status is used to identify whether the update 1976 process is finished. 1977 1978 @param pid: The background process id for auto-update in devserver. 1979 @param kwargs: keyword arguments to make get_au_status devserver call. 1980 @param wait: Should the check wait for completion. 1981 1982 @return: True if auto-update is finished for a given dut. 
1983 """ 1984 logging.debug('Check the progress for auto-update process %r', pid) 1985 kwargs['pid'] = pid 1986 call = self.build_call('get_au_status', **kwargs) 1987 1988 def all_finished(): 1989 """Call devserver.get_au_status rpc to check if auto-update 1990 is finished. 1991 1992 @return: True if auto-update is finished for a given dut. False 1993 otherwise. 1994 @rasies DevServerException, the exception is a wrapper of all 1995 exceptions that were raised when devserver tried to 1996 download the artifacts. devserver raises an HTTPError or 1997 a CmdError when an exception was raised in the code. Such 1998 exception should be re-raised here to stop the caller from 1999 waiting. If the call to devserver failed for connection 2000 issue, a URLError exception is raised, and caller should 2001 retry the call to avoid such network flakiness. 2002 2003 """ 2004 try: 2005 au_status = self.run_call(call) 2006 response = json.loads(au_status) 2007 # This is a temp fix to fit both dict and tuple returning 2008 # values. The dict check will be removed after a corresponding 2009 # devserver CL is deployed. 2010 if isinstance(response, dict): 2011 if response.get('detailed_error_msg'): 2012 raise DevServerException( 2013 response.get('detailed_error_msg')) 2014 2015 if response.get('finished'): 2016 logging.debug('CrOS auto-update is finished') 2017 return True 2018 else: 2019 logging.debug('Current CrOS auto-update status: %s', 2020 response.get('status')) 2021 return False 2022 2023 if not response[0]: 2024 logging.debug('Current CrOS auto-update status: %s', 2025 response[1]) 2026 return False 2027 else: 2028 logging.debug('CrOS auto-update is finished') 2029 return True 2030 except urllib2.HTTPError as e: 2031 error_markup = e.read() 2032 raise DevServerException(_strip_http_message(error_markup)) 2033 except urllib2.URLError as e: 2034 # Could be connection issue, retry it. 
2035 # For example: <urlopen error [Errno 111] Connection refused> 2036 logging.warning('URLError (%r): Retrying connection to ' 2037 'devserver to check auto-update status.', e) 2038 return False 2039 except error.CmdError: 2040 # Retry if SSH failed to connect to the devserver. 2041 logging.warning('CmdError: Retrying SSH connection to check ' 2042 'auto-update status.') 2043 return False 2044 except socket.error as e: 2045 # Could be some temporary devserver connection issues. 2046 logging.warning('Socket Error (%r): Retrying connection to ' 2047 'devserver to check auto-update status.', e) 2048 return False 2049 except ValueError as e: 2050 raise DevServerException( 2051 '%s (Got AU status: %r)' % (str(e), au_status)) 2052 2053 if wait: 2054 bin_utils.poll_for_condition( 2055 all_finished, 2056 exception=bin_utils.TimeoutError(), 2057 timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60, 2058 sleep_interval=CROS_AU_POLLING_INTERVAL) 2059 2060 return True 2061 else: 2062 return all_finished() 2063 2064 2065 def check_for_auto_update_finished(self, response, wait=True, **kwargs): 2066 """Processing response of 'cros_au' and polling for auto-update status. 2067 2068 Will wait for the whole auto-update process is finished. 2069 2070 @param response: The response from RPC 'cros_au' 2071 @param kwargs: keyword arguments to make get_au_status devserver call. 2072 2073 @return: a tuple includes two elements. 2074 finished: True if the operation has completed. 2075 raised_error: None if everything works well or the raised error. 2076 pid: the auto-update process id on devserver. 2077 """ 2078 2079 pid = 0 2080 raised_error = None 2081 finished = False 2082 try: 2083 response = json.loads(response) 2084 if response[0]: 2085 pid = response[1] 2086 # If provision is kicked off asynchronously, pid will be -1. 2087 # If provision is not successfully kicked off , pid continues 2088 # to be 0. 
2089 if pid > 0: 2090 logging.debug('start process %r for auto_update in ' 2091 'devserver', pid) 2092 finished = self._check_for_auto_update_finished( 2093 pid, wait=wait, **kwargs) 2094 except Exception as e: 2095 logging.debug('Failed to trigger auto-update process on devserver') 2096 finished = True 2097 raised_error = e 2098 finally: 2099 return finished, raised_error, pid 2100 2101 2102 def _check_error_message(self, error_patterns_to_check, error_msg): 2103 """Detect whether specific error pattern exist in error message. 2104 2105 @param error_patterns_to_check: the error patterns to check 2106 @param error_msg: the error message which may include any error 2107 pattern. 2108 2109 @return A boolean variable, True if error_msg contains any error 2110 pattern in error_patterns_to_check, False otherwise. 2111 """ 2112 for err in error_patterns_to_check: 2113 if err in error_msg: 2114 return True 2115 2116 return False 2117 2118 2119 def _is_retryable(self, error_msg): 2120 """Detect whether we will retry auto-update based on error_msg. 2121 2122 @param error_msg: The given error message. 2123 2124 @return A boolean variable which indicates whether we will retry 2125 auto_update with another devserver based on the given error_msg. 2126 """ 2127 # For now we just hard-code the error message we think it's suspicious. 2128 # When we get more date about what's the json response when devserver 2129 # is overloaded, we can update this part. 2130 retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE, 2131 'is not pingable'] 2132 return self._check_error_message(retryable_error_patterns, error_msg) 2133 2134 2135 def _should_use_original_payload(self, error_msg): 2136 devserver_error_patterns = ['DevserverCannotStartError'] 2137 return self._check_error_message(devserver_error_patterns, error_msg) 2138 2139 2140 def _parse_buildname_safely(self, build_name): 2141 """Parse a given buildname safely. 2142 2143 @param build_name: the build name to be parsed. 
2144 2145 @return: a tuple (board, build_type, milestone) 2146 """ 2147 try: 2148 board, build_type, milestone, _ = server_utils.ParseBuildName( 2149 build_name) 2150 except server_utils.ParseBuildNameException: 2151 logging.warning('Unable to parse build name %s for metrics. ' 2152 'Continuing anyway.', build_name) 2153 board, build_type, milestone = ('', '', '') 2154 2155 return board, build_type, milestone 2156 2157 2158 def _emit_auto_update_metrics(self, board, build_type, dut_host_name, 2159 build_name, attempt, 2160 success, failure_reason, duration): 2161 """Send metrics for a single auto_update attempt. 2162 2163 @param board: a field in metrics representing which board this 2164 auto_update tries to update. 2165 @param build_type: a field in metrics representing which build type this 2166 auto_update tries to update. 2167 @param dut_host_name: a field in metrics representing which DUT this 2168 auto_update tries to update. 2169 @param build_name: auto update build being updated to. 2170 @param attempt: a field in metrics, representing which attempt/retry 2171 this auto_update is. 2172 @param success: a field in metrics, representing whether this 2173 auto_update succeeds or not. 2174 @param failure_reason: DevServerExceptionClassifier object to show 2175 auto update failure reason, or None. 2176 @param duration: auto update duration time, in seconds. 2177 """ 2178 # The following is high cardinality, but sparse. 2179 # Each DUT is of a single board type, and likely build type. 2180 # The affinity also results in each DUT being attached to the same 2181 # dev_server as well. 2182 fields = { 2183 'board': board, 2184 'build_type': build_type, 2185 'dut_host_name': dut_host_name, 2186 'dev_server': self.resolved_hostname, 2187 'attempt': attempt, 2188 'success': success, 2189 } 2190 2191 # reset_after=True is required for String gauges events to ensure that 2192 # the metrics are not repeatedly emitted until the server restarts. 
2193 2194 metrics.String(PROVISION_PATH + '/auto_update_build_by_devserver_dut', 2195 reset_after=True).set(build_name, fields=fields) 2196 2197 if not success: 2198 metrics.String( 2199 PROVISION_PATH + 2200 '/auto_update_failure_reason_by_devserver_dut', 2201 reset_after=True).set( 2202 failure_reason.classification if failure_reason else '', 2203 fields=fields) 2204 2205 metrics.SecondsDistribution( 2206 PROVISION_PATH + '/auto_update_duration_by_devserver_dut').add( 2207 duration, fields=fields) 2208 2209 2210 def _emit_provision_metrics(self, error_list, duration_list, 2211 is_au_success, board, build_type, milestone, 2212 dut_host_name, is_aue2etest, 2213 total_duration, build_name): 2214 """Send metrics for provision request. 2215 2216 Provision represents potentially multiple auto update attempts. 2217 2218 Please note: to avoid reaching or exceeding the monarch field 2219 cardinality limit, we avoid a metric that includes both dut hostname 2220 and other high cardinality fields. 2221 2222 @param error_list: a list of DevServerExceptionClassifier objects to 2223 show errors happened in provision. Usually it contains 1 ~ 2224 AU_RETRY_LIMIT objects since we only retry provision for several 2225 times. 2226 @param duration_list: a list of provision duration time, counted by 2227 seconds. 2228 @param is_au_success: a field in metrics, representing whether this 2229 auto_update succeeds or not. 2230 @param board: a field in metrics representing which board this 2231 auto_update tries to update. 2232 @param build_type: a field in metrics representing which build type this 2233 auto_update tries to update. 2234 @param milestone: a field in metrics representing which milestone this 2235 auto_update tries to update. 2236 @param dut_host_name: a field in metrics representing which DUT this 2237 auto_update tries to update. 2238 @param is_aue2etest: a field in metrics representing if provision was 2239 done as part of the autoupdate_EndToEndTest. 
2240 """ 2241 # The following is high cardinality, but sparse. 2242 # Each DUT is of a single board type, and likely build type. 2243 # The affinity also results in each DUT being attached to the same 2244 # dev_server as well. 2245 fields = { 2246 'board': board, 2247 'build_type': build_type, 2248 'dut_host_name': dut_host_name, 2249 'dev_server': self.resolved_hostname, 2250 'success': is_au_success, 2251 } 2252 2253 # reset_after=True is required for String gauges events to ensure that 2254 # the metrics are not repeatedly emitted until the server restarts. 2255 2256 metrics.String(PROVISION_PATH + '/provision_build_by_devserver_dut', 2257 reset_after=True).set(build_name, fields=fields) 2258 2259 if error_list: 2260 metrics.String( 2261 PROVISION_PATH + 2262 '/provision_failure_reason_by_devserver_dut', 2263 reset_after=True).set(error_list[0].classification, 2264 fields=fields) 2265 2266 metrics.SecondsDistribution( 2267 PROVISION_PATH + '/provision_duration_by_devserver_dut').add( 2268 total_duration, fields=fields) 2269 2270 2271 def _parse_buildname_from_gs_uri(self, uri): 2272 """Get parameters needed for AU metrics when build_name is not known. 2273 2274 autoupdate_EndToEndTest is run with two Google Storage URIs from the 2275 gs://chromeos-releases bucket. URIs in this bucket do not have the 2276 build_name in the format samus-release/R60-0000.0.0. 2277 2278 We can get the milestone and board by checking the instructions.json 2279 file contained in the bucket with the payloads. 2280 2281 @param uri: The partial uri we received from autoupdate_EndToEndTest. 2282 """ 2283 try: 2284 # Get the instructions file that contains info about the build. 
2285 gs_file = 'gs://chromeos-releases/' + uri + '/*instructions.json' 2286 files = bin_utils.gs_ls(gs_file) 2287 for f in files: 2288 gs_folder, _, instruction_file = f.rpartition('/') 2289 self.stage_artifacts(image=uri, 2290 files=[instruction_file], 2291 archive_url=gs_folder) 2292 json_file = self.get_staged_file_url(instruction_file, uri) 2293 response = urllib2.urlopen(json_file) 2294 data = json.load(response) 2295 return data['board'], 'release', data['version']['milestone'] 2296 except (ValueError, error.CmdError, urllib2.URLError) as e: 2297 logging.debug('Problem getting values for metrics: %s', e) 2298 logging.warning('Unable to parse build name %s from AU test for ' 2299 'metrics. Continuing anyway.', uri) 2300 2301 return '', '', '' 2302 2303 2304 def auto_update(self, host_name, build_name, original_board=None, 2305 original_release_version=None, log_dir=None, 2306 force_update=False, full_update=False, 2307 payload_filename=None, force_original=False, 2308 clobber_stateful=True, quick_provision=False): 2309 """Auto-update a CrOS host. 2310 2311 @param host_name: The hostname of the DUT to auto-update. 2312 @param build_name: The build name to be auto-updated on the DUT. 2313 @param original_board: The original board of the DUT to auto-update. 2314 @param original_release_version: The release version of the DUT's 2315 current build. 2316 @param log_dir: The log directory to store auto-update logs from 2317 devserver. 2318 @param force_update: Force an update even if the version installed 2319 is the same. Default: False. 2320 @param full_update: If True, do not run stateful update, directly 2321 force a full reimage. If False, try stateful 2322 update first if the dut is already installed 2323 with the same version. 2324 @param payload_filename: Used to specify the exact file to 2325 use for autoupdating. If None, the payload 2326 will be determined by build_name. You 2327 must have already staged this file before 2328 passing it in here. 
2329 @param force_original: Whether to force stateful update with the 2330 original payload. 2331 @param clobber_stateful: If True do a clean install of stateful. 2332 @param quick_provision: Attempt to use quick provision path first. 2333 2334 @return A set (is_success, pid) in which: 2335 1. is_success indicates whether this auto_update succeeds. 2336 2. pid is the process id of the successful autoupdate run. 2337 2338 @raise DevServerException if auto_update fails and is not retryable. 2339 @raise RetryableProvisionException if it fails and is retryable. 2340 """ 2341 kwargs = {'host_name': host_name, 2342 'build_name': build_name, 2343 'force_update': force_update, 2344 'full_update': full_update, 2345 'clobber_stateful': clobber_stateful, 2346 'quick_provision': quick_provision} 2347 2348 is_aue2etest = payload_filename is not None 2349 2350 if is_aue2etest: 2351 kwargs['payload_filename'] = payload_filename 2352 2353 error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s' 2354 is_au_success = False 2355 au_log_dir = os.path.join(log_dir, 2356 AUTO_UPDATE_LOG_DIR) if log_dir else None 2357 error_list = [] 2358 retry_with_another_devserver = False 2359 duration_list = [] 2360 2361 if is_aue2etest: 2362 board, build_type, milestone = self._parse_buildname_from_gs_uri( 2363 build_name) 2364 else: 2365 board, build_type, milestone = self._parse_buildname_safely( 2366 build_name) 2367 2368 provision_start_time = time.time() 2369 for au_attempt in range(AU_RETRY_LIMIT): 2370 logging.debug('Start CrOS auto-update for host %s at %d time(s).', 2371 host_name, au_attempt + 1) 2372 au_start_time = time.time() 2373 failure_reason = None 2374 # No matter _trigger_auto_update succeeds or fails, the auto-update 2375 # track_status_file should be cleaned, and the auto-update execute 2376 # log should be collected to directory sysinfo. Also, the error 2377 # raised by _trigger_auto_update should be displayed. 
2378 try: 2379 # Try update with stateful.tgz of old release version in the 2380 # last try of auto-update. 2381 if force_original and original_release_version: 2382 # Monitor this case in monarch 2383 original_build = '%s/%s' % (original_board, 2384 original_release_version) 2385 c = metrics.Counter( 2386 'chromeos/autotest/provision/' 2387 'cros_update_with_original_build') 2388 f = {'dev_server': self.resolved_hostname, 2389 'board': board, 2390 'build_type': build_type, 2391 'milestone': milestone, 2392 'original_build': original_build} 2393 c.increment(fields=f) 2394 2395 logging.debug('Try updating stateful partition of the ' 2396 'host with the same version of its current ' 2397 'rootfs partition: %s', original_build) 2398 response = self._trigger_auto_update( 2399 original_build=original_build, **kwargs) 2400 else: 2401 response = self._trigger_auto_update(**kwargs) 2402 except DevServerException as e: 2403 logging.debug(error_msg_attempt, au_attempt+1, str(e)) 2404 failure_reason = DevServerExceptionClassifier(str(e)) 2405 else: 2406 _, raised_error, pid = self.check_for_auto_update_finished( 2407 response, **kwargs) 2408 2409 # Error happens in _collect_au_log won't be raised. 2410 if au_log_dir: 2411 is_collect_success = self.collect_au_log( 2412 kwargs['host_name'], pid, au_log_dir) 2413 else: 2414 is_collect_success = True 2415 2416 # Error happens in _clean_track_log won't be raised. 2417 if pid >= 0: 2418 is_clean_success = self.clean_track_log( 2419 kwargs['host_name'], pid) 2420 else: 2421 is_clean_success = True 2422 2423 # If any error is raised previously, log it and retry 2424 # auto-update. Otherwise, claim a successful CrOS auto-update. 
2425 if (not raised_error and is_clean_success and 2426 is_collect_success): 2427 logging.debug('CrOS auto-update succeed for host %s', 2428 host_name) 2429 is_au_success = True 2430 break 2431 else: 2432 if not self.kill_au_process_for_host(kwargs['host_name'], 2433 pid): 2434 logging.debug('Failed to kill auto_update process %d', 2435 pid) 2436 if raised_error: 2437 error_str = str(raised_error) 2438 logging.debug(error_msg_attempt, au_attempt + 1, 2439 error_str) 2440 if au_log_dir: 2441 logging.debug('Please see error details in log %s', 2442 self._get_au_log_filename( 2443 au_log_dir, 2444 kwargs['host_name'], 2445 pid)) 2446 failure_reason = DevServerExceptionClassifier( 2447 error_str, keep_full_trace=False) 2448 if self._is_retryable(error_str): 2449 retry_with_another_devserver = True 2450 2451 if self._should_use_original_payload(error_str): 2452 force_original = True 2453 2454 finally: 2455 duration = int(time.time() - au_start_time) 2456 duration_list.append(duration) 2457 if failure_reason: 2458 error_list.append(failure_reason) 2459 self._emit_auto_update_metrics(board, build_type, host_name, 2460 build_name, au_attempt + 1, 2461 is_au_success, failure_reason, 2462 duration) 2463 if retry_with_another_devserver: 2464 break 2465 2466 if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1: 2467 time.sleep(CROS_AU_RETRY_INTERVAL) 2468 # Use the IP of DUT if the hostname failed. 
2469 host_name_ip = socket.gethostbyname(host_name) 2470 kwargs['host_name'] = host_name_ip 2471 logging.debug( 2472 'AU failed, trying IP instead of hostname: %s', 2473 host_name_ip) 2474 2475 total_duration = int(time.time() - provision_start_time) 2476 self._emit_provision_metrics(error_list, duration_list, is_au_success, 2477 board, build_type, milestone, host_name, 2478 is_aue2etest, total_duration, build_name) 2479 2480 if is_au_success: 2481 return (is_au_success, pid) 2482 2483 # If errors happen in the CrOS AU process, report the concatenation 2484 # of the errors happening in first & second provision. 2485 # If error happens in RPCs of cleaning track log, collecting 2486 # auto-update logs, or killing auto-update processes, just report a 2487 # common error here. 2488 if error_list: 2489 real_error = ', '.join(['%d) %s' % (i, e.summary) 2490 for i, e in enumerate(error_list)]) 2491 if retry_with_another_devserver: 2492 raise RetryableProvisionException(real_error) 2493 else: 2494 raise error_list[0].classified_exception(real_error) 2495 else: 2496 raise DevServerException('RPC calls after the whole auto-update ' 2497 'process failed.') 2498 2499 2500class AndroidBuildServer(ImageServerBase): 2501 """Class for DevServer that handles RPCs related to Android builds. 2502 2503 The calls to devserver to stage artifacts, including stage and download, are 2504 made in async mode. That is, when caller makes an RPC |stage| to request 2505 devserver to stage certain artifacts, devserver handles the call and starts 2506 staging artifacts in a new thread, and return |Success| without waiting for 2507 staging being completed. When caller receives message |Success|, it polls 2508 devserver's is_staged call until all artifacts are staged. 2509 Such mechanism is designed to prevent cherrypy threads in devserver being 2510 running out, as staging artifacts might take long time, and cherrypy starts 2511 with a fixed number of threads that handle devserver rpc. 
2512 """ 2513 2514 def wait_for_artifacts_staged(self, target, build_id, branch, 2515 archive_url=None, artifacts='', files=''): 2516 """Polling devserver.is_staged until all artifacts are staged. 2517 2518 @param target: Target of the android build to stage, e.g., 2519 shamu-userdebug. 2520 @param build_id: Build id of the android build to stage. 2521 @param branch: Branch of the android build to stage. 2522 @param archive_url: Google Storage URL for the build. 2523 @param artifacts: Comma separated list of artifacts to download. 2524 @param files: Comma separated list of files to download. 2525 2526 @return: True if all artifacts are staged in devserver. 2527 """ 2528 kwargs = {'target': target, 2529 'build_id': build_id, 2530 'branch': branch, 2531 'artifacts': artifacts, 2532 'files': files, 2533 'os_type': 'android'} 2534 if archive_url: 2535 kwargs['archive_url'] = archive_url 2536 return self._poll_is_staged(**kwargs) 2537 2538 2539 @remote_devserver_call() 2540 def call_and_wait(self, call_name, target, build_id, branch, archive_url, 2541 artifacts, files, error_message, 2542 expected_response=SUCCESS): 2543 """Helper method to make a urlopen call, and wait for artifacts staged. 2544 2545 @param call_name: name of devserver rpc call. 2546 @param target: Target of the android build to stage, e.g., 2547 shamu-userdebug. 2548 @param build_id: Build id of the android build to stage. 2549 @param branch: Branch of the android build to stage. 2550 @param archive_url: Google Storage URL for the CrOS build. 2551 @param artifacts: Comma separated list of artifacts to download. 2552 @param files: Comma separated list of files to download. 2553 @param expected_response: Expected response from rpc, default to 2554 |Success|. If it's set to None, do not compare 2555 the actual response. Any response is consider 2556 to be good. 2557 @param error_message: Error message to be thrown if response does not 2558 match expected_response. 
2559 2560 @return: The response from rpc. 2561 @raise DevServerException upon any return code that's expected_response. 2562 2563 """ 2564 kwargs = {'target': target, 2565 'build_id': build_id, 2566 'branch': branch, 2567 'artifacts': artifacts, 2568 'files': files, 2569 'os_type': 'android'} 2570 if archive_url: 2571 kwargs['archive_url'] = archive_url 2572 return self._call_and_wait(call_name, error_message, expected_response, 2573 **kwargs) 2574 2575 2576 @remote_devserver_call() 2577 def stage_artifacts(self, target=None, build_id=None, branch=None, 2578 image=None, artifacts=None, files='', archive_url=None): 2579 """Tell the devserver to download and stage |artifacts| from |image|. 2580 2581 This is the main call point for staging any specific artifacts for a 2582 given build. To see the list of artifacts one can stage see: 2583 2584 ~src/platfrom/dev/artifact_info.py. 2585 2586 This is maintained along with the actual devserver code. 2587 2588 @param target: Target of the android build to stage, e.g., 2589 shamu-userdebug. 2590 @param build_id: Build id of the android build to stage. 2591 @param branch: Branch of the android build to stage. 2592 @param image: Name of a build to test, in the format of 2593 branch/target/build_id 2594 @param artifacts: A list of artifacts. 2595 @param files: A list of files to stage. 2596 @param archive_url: Optional parameter that has the archive_url to stage 2597 this artifact from. Default is specified in autotest config + 2598 image. 2599 2600 @raise DevServerException upon any return code that's not HTTP OK. 
2601 """ 2602 if image and not target and not build_id and not branch: 2603 branch, target, build_id = utils.parse_launch_control_build(image) 2604 if not target or not build_id or not branch: 2605 raise DevServerException('Must specify all build info (target, ' 2606 'build_id and branch) to stage.') 2607 2608 android_build_info = {'target': target, 2609 'build_id': build_id, 2610 'branch': branch} 2611 if not artifacts and not files: 2612 raise DevServerException('Must specify something to stage.') 2613 if not all(android_build_info.values()): 2614 raise DevServerException( 2615 'To stage an Android build, must specify target, build id ' 2616 'and branch.') 2617 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2618 self._stage_artifacts(build, artifacts, files, archive_url, 2619 **android_build_info) 2620 2621 def trigger_download(self, target, build_id, branch, artifacts=None, 2622 files='', os='android', synchronous=True): 2623 """Tell the devserver to download and stage an Android build. 2624 2625 Tells the devserver to fetch an Android build from the image storage 2626 server named by _get_image_storage_server(). 2627 2628 If |synchronous| is True, waits for the entire download to finish 2629 staging before returning. Otherwise only the artifacts necessary 2630 to start installing images onto DUT's will be staged before returning. 2631 A caller can then call finish_download to guarantee the rest of the 2632 artifacts have finished staging. 2633 2634 @param target: Target of the android build to stage, e.g., 2635 shamu-userdebug. 2636 @param build_id: Build id of the android build to stage. 2637 @param branch: Branch of the android build to stage. 2638 @param artifacts: A string of artifacts separated by comma. If None, 2639 use the default artifacts for Android or Brillo build. 2640 @param files: String of file seperated by commas. 2641 @param os: OS artifacts to download (android/brillo). 
2642 @param synchronous: if True, waits until all components of the image are 2643 staged before returning. 2644 2645 @raise DevServerException upon any return code that's not HTTP OK. 2646 2647 """ 2648 android_build_info = {'target': target, 2649 'build_id': build_id, 2650 'branch': branch} 2651 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2652 if not artifacts: 2653 board = target.split('-')[0] 2654 artifacts = ( 2655 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2656 board, os)) 2657 self._trigger_download(build, artifacts, files=files, 2658 synchronous=synchronous, **android_build_info) 2659 2660 2661 def finish_download(self, target, build_id, branch, os='android'): 2662 """Tell the devserver to finish staging an Android build. 2663 2664 If trigger_download is called with synchronous=False, it will return 2665 before all artifacts have been staged. This method contacts the 2666 devserver and blocks until all staging is completed and should be 2667 called after a call to trigger_download. 2668 2669 @param target: Target of the android build to stage, e.g., 2670 shamu-userdebug. 2671 @param build_id: Build id of the android build to stage. 2672 @param branch: Branch of the android build to stage. 2673 @param os: OS artifacts to download (android/brillo). 2674 2675 @raise DevServerException upon any return code that's not HTTP OK. 2676 """ 2677 android_build_info = {'target': target, 2678 'build_id': build_id, 2679 'branch': branch} 2680 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2681 board = target.split('-')[0] 2682 artifacts = ( 2683 android_utils.AndroidArtifacts.get_artifacts_for_reimage( 2684 board)) 2685 self._finish_download(build, artifacts, files='', **android_build_info) 2686 2687 2688 def get_staged_file_url(self, filename, target, build_id, branch): 2689 """Returns the url of a staged file for this image on the devserver. 2690 2691 @param filename: Name of the file. 
2692 @param target: Target of the android build to stage, e.g., 2693 shamu-userdebug. 2694 @param build_id: Build id of the android build to stage. 2695 @param branch: Branch of the android build to stage. 2696 2697 @return: The url of a staged file for this image on the devserver. 2698 """ 2699 android_build_info = {'target': target, 2700 'build_id': build_id, 2701 'branch': branch, 2702 'os_type': 'android'} 2703 build = ANDROID_BUILD_NAME_PATTERN % android_build_info 2704 return '/'.join([self._get_image_url(build), filename]) 2705 2706 2707 @remote_devserver_call() 2708 def translate(self, build_name): 2709 """Translate the build name if it's in LATEST format. 2710 2711 If the build name is in the format [branch]/[target]/LATEST, return the 2712 latest build in Launch Control otherwise return the build name as is. 2713 2714 @param build_name: build_name to check. 2715 2716 @return The actual build name to use. 2717 """ 2718 branch, target, build_id = utils.parse_launch_control_build(build_name) 2719 if build_id.upper() != 'LATEST': 2720 return build_name 2721 call = self.build_call('latestbuild', branch=branch, target=target, 2722 os_type='android') 2723 translated_build_id = self.run_call(call) 2724 translated_build = (ANDROID_BUILD_NAME_PATTERN % 2725 {'branch': branch, 2726 'target': target, 2727 'build_id': translated_build_id}) 2728 logging.debug('Translated relative build %s to %s', build_name, 2729 translated_build) 2730 return translated_build 2731 2732 2733def _is_load_healthy(load): 2734 """Check if devserver's load meets the minimum threshold. 2735 2736 @param load: The devserver's load stats to check. 2737 2738 @return: True if the load meets the minimum threshold. Return False 2739 otherwise. 2740 2741 """ 2742 # Threshold checks, including CPU load. 
2743 if load[DevServer.CPU_LOAD] > DevServer.MAX_CPU_LOAD: 2744 logging.debug('CPU load of devserver %s is at %s%%, which is higher ' 2745 'than the threshold of %s%%', load['devserver'], 2746 load[DevServer.CPU_LOAD], DevServer.MAX_CPU_LOAD) 2747 return False 2748 if load[DevServer.NETWORK_IO] > DevServer.MAX_NETWORK_IO: 2749 logging.debug('Network IO of devserver %s is at %i Bps, which is ' 2750 'higher than the threshold of %i bytes per second.', 2751 load['devserver'], load[DevServer.NETWORK_IO], 2752 DevServer.MAX_NETWORK_IO) 2753 return False 2754 return True 2755 2756 2757def _compare_load(devserver1, devserver2): 2758 """Comparator function to compare load between two devservers. 2759 2760 @param devserver1: A dictionary of devserver load stats to be compared. 2761 @param devserver2: A dictionary of devserver load stats to be compared. 2762 2763 @return: Negative value if the load of `devserver1` is less than the load 2764 of `devserver2`. Return positive value otherwise. 2765 2766 """ 2767 return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO]) 2768 2769 2770def _get_subnet_for_host_ip(host_ip, 2771 restricted_subnets=utils.RESTRICTED_SUBNETS): 2772 """Get the subnet for a given host IP. 2773 2774 @param host_ip: the IP of a DUT. 2775 @param restricted_subnets: A list of restriected subnets. 2776 2777 @return: a (subnet_ip, mask_bits) tuple. If no matched subnet for the 2778 host_ip, return (None, None). 2779 """ 2780 for subnet_ip, mask_bits in restricted_subnets: 2781 if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits): 2782 return subnet_ip, mask_bits 2783 2784 return None, None 2785 2786 2787def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None): 2788 """Get the devserver with the least load. 2789 2790 Iterate through all devservers and get the one with least load. 2791 2792 TODO(crbug.com/486278): Devserver with required build already staged should 2793 take higher priority. 
This will need check_health call to be able to verify 2794 existence of a given build/artifact. Also, in case all devservers are 2795 overloaded, the logic here should fall back to the old behavior that randomly 2796 selects a devserver based on the hash of the image name/url. 2797 2798 @param devserver_type: Type of devserver to select from. Default is set to 2799 ImageServer. 2800 @param hostname: Hostname of the dut that the devserver is used for. The 2801 picked devserver needs to respect the location of the host if 2802 `prefer_local_devserver` is set to True or `restricted_subnets` is 2803 set. 2804 2805 @return: Name of the devserver with the least load. 2806 2807 """ 2808 logging.debug('Get the least loaded %r', devserver_type) 2809 devservers, can_retry = devserver_type.get_available_devservers( 2810 hostname) 2811 # If no healthy devservers available and can_retry is False, return None. 2812 # Otherwise, relax the constrain on hostname, allow all devservers to be 2813 # available. 2814 if not devserver_type.get_healthy_devserver('', devservers): 2815 if not can_retry: 2816 return None 2817 else: 2818 devservers, _ = devserver_type.get_available_devservers() 2819 2820 # get_devserver_load call needs to be made in a new process to allow force 2821 # timeout using signal. 2822 output = multiprocessing.Queue() 2823 processes = [] 2824 for devserver in devservers: 2825 processes.append(multiprocessing.Process( 2826 target=devserver_type.get_devserver_load_wrapper, 2827 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output))) 2828 2829 for p in processes: 2830 p.start() 2831 for p in processes: 2832 # The timeout for the process commands aren't reliable. Add 2833 # some extra time to the timeout for potential overhead in the 2834 # subprocesses. crbug.com/913695 2835 p.join(TIMEOUT_GET_DEVSERVER_LOAD + 10) 2836 # Read queue before killing processes to avoid corrupting the queue. 
2837 loads = [output.get() for p in processes if not p.is_alive()] 2838 for p in processes: 2839 if p.is_alive(): 2840 p.terminate() 2841 # Filter out any load failed to be retrieved or does not support load check. 2842 loads = [load for load in loads if load and DevServer.CPU_LOAD in load and 2843 DevServer.is_free_disk_ok(load) and 2844 DevServer.is_apache_client_count_ok(load)] 2845 if not loads: 2846 logging.debug('Failed to retrieve load stats from any devserver. No ' 2847 'load balancing can be applied.') 2848 return None 2849 loads = [load for load in loads if _is_load_healthy(load)] 2850 if not loads: 2851 logging.error('No devserver has the capacity to be selected.') 2852 return None 2853 loads = sorted(loads, cmp=_compare_load) 2854 return loads[0]['devserver'] 2855 2856 2857def resolve(build, hostname=None, ban_list=None): 2858 """Resolve a devserver can be used for given build and hostname. 2859 2860 @param build: Name of a build to stage on devserver, e.g., 2861 ChromeOS build: daisy-release/R50-1234.0.0 2862 Launch Control build: git_mnc_release/shamu-eng 2863 @param hostname: Hostname of a devserver for, default is None, which means 2864 devserver is not restricted by the network location of the host. 2865 @param ban_list: The blacklist of devservers shouldn't be chosen. 2866 2867 @return: A DevServer instance that can be used to stage given build for the 2868 given host. 2869 """ 2870 if utils.is_launch_control_build(build): 2871 return AndroidBuildServer.resolve(build, hostname) 2872 else: 2873 return ImageServer.resolve(build, hostname, ban_list=ban_list) 2874