• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from distutils import version
6import cStringIO
7import HTMLParser
8import httplib
9import json
10import logging
11import multiprocessing
12import os
13import re
14import socket
15import time
16import urllib2
17import urlparse
18
19from autotest_lib.client.bin import utils as bin_utils
20from autotest_lib.client.common_lib import android_utils
21from autotest_lib.client.common_lib import error
22from autotest_lib.client.common_lib import global_config
23from autotest_lib.client.common_lib import utils
24from autotest_lib.client.common_lib.cros import retry
25from autotest_lib.server import utils as server_utils
26# TODO(cmasone): redo this class using requests module; http://crosbug.com/30107
27
28try:
29    from chromite.lib import metrics
30except ImportError:
31    metrics = utils.metrics_mock
32
33
# Shared handle on the global autotest configuration.
CONFIG = global_config.global_config
# This file is generated at build time and specifies, per suite and per test,
# the DEPENDENCIES list specified in each control file.  It's a dict of dicts:
# {'bvt':   {'/path/to/autotest/control/site_tests/test1/control': ['dep1']}
#  'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']}
#  'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'],
#            '/path/to/autotest/control/site_tests/test3/control': ['dep3']}
# }
DEPENDENCIES_FILE = 'test_suites/dependency_info'
# Number of seconds for caller to poll devserver's is_staged call to check if
# artifacts are staged.
_ARTIFACT_STAGE_POLLING_INTERVAL = 5
# Artifacts that should be staged when client calls devserver RPC to stage an
# image.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful'
# Artifacts that should be staged when client calls devserver RPC to stage an
# image with autotest artifact.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,'
                                                   'control_files,stateful,'
                                                   'autotest_packages')
# Artifacts that should be staged when client calls devserver RPC to stage an
# Android build.
_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions')
# When True, DevServer.is_free_disk_ok() and
# DevServer.is_apache_client_count_ok() skip their checks and report success.
SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value(
        'CROS', 'skip_devserver_health_check', type=bool)
# Number of seconds for the call to get devserver load to time out.
TIMEOUT_GET_DEVSERVER_LOAD = 2.0

# Android artifact path in devserver. Backslashes are stripped from the
# configured value.
ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value(
        'CROS', 'android_build_name_pattern', type=str).replace('\\', '')

# Return value from a devserver RPC indicating the call succeeded.
SUCCESS = 'Success'

# The timeout, in minutes, for a given devserver ssh call. Also the default
# timeout of DevServer.get_devserver_load() and DevServer.devserver_healthy().
DEVSERVER_SSH_TIMEOUT_MINS = 1

# Error message for invalid devserver response.
ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error'

# Substring of an exception message that marks a timed-out devserver call;
# remote_devserver_call() looks for it to count call timeouts.
ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout'

# The timeout, in minutes, for waiting for a devserver to finish staging. Also
# the default retry timeout of the remote_devserver_call() decorator.
DEVSERVER_IS_STAGING_RETRY_MIN = 100

# The timeout, in minutes, for waiting for a DUT auto-update to finish.
DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100

# The total number of times a devserver may trigger CrOS auto-update.
AU_RETRY_LIMIT = 2

# Number of seconds for caller to poll devserver's get_au_status call to
# check if cros auto-update is finished.
CROS_AU_POLLING_INTERVAL = 10

# Number of seconds for intervals between retrying auto-update calls.
CROS_AU_RETRY_INTERVAL = 20

# The file name for auto-update logs.
CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log'

# Provision error patterns.
# Each entry is a (regular expression, classification string) pair.
# People who see this should know that they shouldn't change these
# classification strings. These strings are used for monitoring provision
# failures. Any changes may mess up the stats.
_EXCEPTION_PATTERNS = [
        # Raised when devserver portfile does not exist on host.
        (r".*Devserver portfile does not exist!.*$",
         '(1) Devserver portfile does not exist on host'),
        # Raised when devserver cannot copy packages to host.
        (r".*Could not copy .* to device.*$",
         '(2) Cannot copy packages to host'),
        # Raised when devserver fails to run specific commands on host.
        (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$",
         '(3) Fail to run specific command on host'),
        # Raised when new build fails to boot on the host.
        (r'.*RootfsUpdateError: Build .* failed to boot on.*$',
         '(4) Build failed to boot on host'),
        # Raised when the auto-update process is timed out.
        (r'.*The CrOS auto-update process is timed out, '
         'thus will be terminated.*$',
         '(5) Auto-update is timed out'),
        # Raised when the host is not pingable.
        (r".*DeviceNotPingableError.*$",
         '(6) Host is not pingable during auto-update'),
        # Raised when hosts have unexpected status after rootfs update.
        (r'.*Update failed with unexpected update status: '
         'UPDATE_STATUS_IDLE.*$',
         '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs '
         'update'),
        # Raised when devserver returns non-json response to shard/drone.
        (r'.*No JSON object could be decoded.*$',
         '(8) Devserver returned non-json object'),
        # Raised when devserver loses host's ssh connection
        (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$',
         "(9) Devserver lost host's ssh connection"),
        # Raised when error happens in writing files to host
        (r'.*Write failed\: Broken pipe.*$',
         "(10) Broken pipe while writing or connecting to host")]

# Default for the prefer_local_devserver argument of
# DevServer.get_available_devservers().
PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'prefer_local_devserver', type=bool, default=False)

# NOTE(review): presumably switches devserver calls over to ssh; the consumer
# is not in this part of the file -- confirm.
ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'enable_ssh_connection_for_devserver', type=bool,
        default=False)

# Directory to save auto-update logs
AUTO_UPDATE_LOG_DIR = 'autoupdate_logs'

# Default number of subnet mask bits used when looking for devservers in the
# same subnet as a DUT.
DEFAULT_SUBNET_MASKBIT = 19
147
148
class DevServerException(Exception):
    """Generic devserver failure.

    Raised when a devserver RPC comes back with an HTTP error, and by
    DevServer.resolve() when no healthy devserver can be found.
    """
152
class RetryableProvisionException(DevServerException):
    """A provision failure whose cause is considered retryable."""
156
class DevServerOverloadException(Exception):
    """The dev server answered with a 502 HTTP response.

    remote_devserver_call() treats this exception as retryable.
    """
160
class DevServerFailToLocateException(Exception):
    """No devserver could be located for the request."""
164
class MarkupStripper(HTMLParser.HTMLParser):
    """HTML parser that strips HTML tags, coded characters like &amp;

    Works by, basically, not doing anything for any tags, and only recording
    the content of text nodes in an internal data structure.
    """
    def __init__(self):
        # Run the base-class initializer instead of calling reset() directly,
        # so any state the parser sets up in __init__ is present. The explicit
        # call form is used rather than super() so it does not depend on the
        # base class being new-style.
        HTMLParser.HTMLParser.__init__(self)
        # Text fragments collected so far, in document order.
        self.fed = []


    def handle_data(self, d):
        """Consume content of text nodes, store it away.

        @param d: text content of one text node.
        """
        self.fed.append(d)


    def get_data(self):
        """Concatenate and return all stored data.

        @return: A single string holding every text fragment seen so far.
        """
        return ''.join(self.fed)
184
185
def _strip_http_message(message):
    """Strip the HTML markup from an HTTP message.

    @param message: A string returned by an HTTP call.

    @return: A string with the HTML markup stripped.
    """
    strip = MarkupStripper()
    try:
        strip.feed(message.decode('utf_32'))
    except UnicodeError:
        # The message is not UTF-32, so feed it as-is. UnicodeError is caught
        # rather than UnicodeDecodeError because calling decode() on a unicode
        # object with non-ASCII text raises UnicodeEncodeError in Python 2,
        # which would otherwise escape this fallback.
        strip.feed(message)
    return strip.get_data()
199
200
def _get_image_storage_server():
    """Read the 'image_storage_server' setting of the CROS config section.

    @return: The configured image storage server, as a string.
    """
    image_server = CONFIG.get_config_value(
            'CROS', 'image_storage_server', type=str)
    return image_server
203
204
def _get_canary_channel_server():
    """Read the 'canary_channel_server' setting of the CROS config section.

    Example value:
    gsutil://chromeos-releases/canary-channel/<board>/<release>

    @return: The url to the canary channel server.
    """
    canary_server = CONFIG.get_config_value(
            'CROS', 'canary_channel_server', type=str)
    return canary_server
213
214
def _get_storage_server_for_artifacts(artifacts=None):
    """Pick the storage server that holds the given artifacts.

    The canary channel server is chosen only when a 'factory_artifact' is
    configured in the CROS section and it appears in |artifacts|.

    @param artifacts: A list of artifacts we need to stage.
    @return: The address of the storage server that has these artifacts.
             The default image storage server if no artifacts are specified.
    """
    config = global_config.global_config
    factory_artifact = config.get_config_value(
            'CROS', 'factory_artifact', type=str, default='')
    needs_canary = bool(
            artifacts and factory_artifact and factory_artifact in artifacts)
    if not needs_canary:
        return _get_image_storage_server()
    return _get_canary_channel_server()
227
228
def _reverse_lookup_from_config(address):
    """Map an IP address back to a hostname using the config file.

    The hostname-address map from the config file is searched for an entry
    whose address equals |address|. If several hostnames map to the same
    address, the first match in the map's iteration order is returned.

    @param address: IP address string
    @returns: hostname string, or original input if not found
    """
    matching_hostnames = (
            name for name, mapped_addr
            in _get_hostname_addr_map().iteritems()
            if mapped_addr == address)
    # Fall back to the address itself when nothing matches.
    return next(matching_hostnames, address)
244
245
def _get_hostname_addr_map():
    """Load the HOSTNAME_ADDR_MAP section of the config.

    @return: dict mapping server hostnames to addresses
    """
    section_name = 'HOSTNAME_ADDR_MAP'
    return CONFIG.get_section_as_dict(section_name)
252
253
def _get_dev_server_list():
    """Read the 'dev_server' setting of the CROS config section.

    @return: The configured devservers as a list; an empty list by default.
    """
    dev_servers = CONFIG.get_config_value(
            'CROS', 'dev_server', type=list, default=[])
    return dev_servers
256
257
def _get_crash_server_list():
    """Read the 'crash_server' setting of the CROS config section.

    @return: The configured crash servers as a list; an empty list by
             default.
    """
    crash_servers = CONFIG.get_config_value(
            'CROS', 'crash_server', type=list, default=[])
    return crash_servers
261
262
def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN,
                          exception_to_raise=DevServerException):
    """A decorator to use with remote devserver calls.

    This decorator converts urllib2.HTTPErrors into DevServerExceptions
    with any embedded error info converted into plain text. The method
    retries on urllib2.URLError, error.CmdError or DevServerOverloadException
    to avoid devserver flakiness.

    When the call finally fails with an exception whose message contains
    ERR_MSG_FOR_TIMED_OUT_CALL, a call_timeout metric is incremented before
    the exception is re-raised.

    @param timeout_min: Forwarded to retry.retry as its timeout_min argument.
    @param exception_to_raise: Forwarded to retry.retry as its
                               exception_to_raise argument.

    @return: A decorator to apply to the function making the devserver call.
    """
    #pylint: disable=C0111

    def inner_decorator(method):
        # Label used for the retry decorator, the debug log and the metric;
        # None when the decorated object has no __name__.
        label = method.__name__ if hasattr(method, '__name__') else None
        def metrics_wrapper(*args, **kwargs):
            # The retrying wrapper is rebuilt on every call so that it can
            # close over this call's args and kwargs.
            @retry.retry((urllib2.URLError, error.CmdError,
                          DevServerOverloadException),
                         timeout_min=timeout_min,
                         exception_to_raise=exception_to_raise,
                        label=label)
            def wrapper():
                """This wrapper actually catches the HTTPError."""
                try:
                    return method(*args, **kwargs)
                except urllib2.HTTPError as e:
                    # The HTTP error body is markup; strip it down to text.
                    error_markup = e.read()
                    raise DevServerException(_strip_http_message(error_markup))

            try:
                return wrapper()
            except Exception as e:
                if ERR_MSG_FOR_TIMED_OUT_CALL in str(e):
                    # Work out which devserver timed out: either the DevServer
                    # instance the method is bound to, or an explicit
                    # 'devserver' keyword argument holding its URL.
                    dev_server = None
                    if args and isinstance(args[0], DevServer):
                        dev_server = args[0].hostname
                    elif 'devserver' in kwargs:
                        dev_server = get_hostname(kwargs['devserver'])

                    logging.debug('RPC call %s has timed out on devserver %s.',
                                  label, dev_server)
                    c = metrics.Counter(
                            'chromeos/autotest/devserver/call_timeout')
                    # NOTE(review): the 'healthy' field carries the RPC label
                    # here, which looks unintentional; the metric schema may
                    # depend on it, so confirm before renaming.
                    c.increment(fields={'dev_server': dev_server,
                                        'healthy': label})

                # Always propagate the original failure to the caller.
                raise

        return metrics_wrapper

    return inner_decorator
311
312
def get_hostname(url):
    """Extract the hostname component of a URL.

    For a URL shaped like schema://hostname:port/path this returns only the
    hostname part.

    @param url: a Url string
    @return: a hostname string
    """
    parsed_url = urlparse.urlparse(url)
    return parsed_url.hostname
322
323
class DevServer(object):
    """Base class for all DevServer-like server stubs.

    This is the base class for interacting with all Dev Server-like servers.
    A caller should instantiate a sub-class of DevServer with:

    host = SubClassServer.resolve(build)
    server = SubClassServer(host)
    """
    # Minimum free disk space, in GB, for is_free_disk_ok() to pass.
    _MIN_FREE_DISK_SPACE_GB = 20
    # Maximum live Apache client count for is_apache_client_count_ok() to
    # pass.
    _MAX_APACHE_CLIENT_COUNT = 75
    # Threshold for the CPU load percentage for a devserver to be selected.
    MAX_CPU_LOAD = 80.0
    # Threshold for the network IO, set to 80MB/s
    MAX_NETWORK_IO = 1024 * 1024 * 80
    # Key names. FREE_DISK, AU_PROCESS and APACHE_CLIENT_COUNT are used below
    # as keys of the load dict returned by get_devserver_load(); presumably
    # the others are keys of the same dict -- TODO confirm.
    DISK_IO = 'disk_total_bytes_per_second'
    NETWORK_IO = 'network_total_bytes_per_second'
    CPU_LOAD = 'cpu_percent'
    FREE_DISK = 'free_disk'
    AU_PROCESS = 'au_process_count'
    STAGING_THREAD_COUNT = 'staging_thread_count'
    APACHE_CLIENT_COUNT = 'apache_client_count'
346
347
348    def __init__(self, devserver):
349        self._devserver = devserver
350
351
352    def url(self):
353        """Returns the url for this devserver."""
354        return self._devserver
355
356
357    @property
358    def hostname(self):
359        """Return devserver hostname parsed from the devserver URL.
360
361        Note that this is likely parsed from the devserver URL from
362        shadow_config.ini, meaning that the "hostname" part of the
363        devserver URL is actually an IP address.
364
365        @return hostname string
366        """
367        return get_hostname(self.url())
368
369
370    @property
371    def resolved_hostname(self):
372        """Return devserver hostname, resolved from its IP address.
373
374        Unlike the hostname property, this property attempts to look up
375        the proper hostname from the devserver IP address.  If lookup
376        fails, then fall back to whatever the hostname property would
377        have returned.
378
379        @return hostname string
380        """
381        return _reverse_lookup_from_config(self.hostname)
382
383
384    @staticmethod
385    def get_server_url(url):
386        """Get the devserver url from a repo url, which includes build info.
387
388        @param url: A job repo url.
389
390        @return A devserver url, e.g., http://127.0.0.10:8080
391        """
392        res = urlparse.urlparse(url)
393        if res.netloc:
394            return res.scheme + '://' + res.netloc
395
396
397    @classmethod
398    def get_devserver_load_wrapper(cls, devserver, timeout_sec, output):
399        """A wrapper function to call get_devserver_load in parallel.
400
401        @param devserver: url of the devserver.
402        @param timeout_sec: Number of seconds before time out the devserver
403                            call.
404        @param output: An output queue to save results to.
405        """
406        load = cls.get_devserver_load(devserver, timeout_min=timeout_sec/60.0)
407        if load:
408            load['devserver'] = devserver
409        output.put(load)
410
411
412    @classmethod
413    def get_devserver_load(cls, devserver,
414                           timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
415        """Returns True if the |devserver| is healthy to stage build.
416
417        @param devserver: url of the devserver.
418        @param timeout_min: How long to wait in minutes before deciding the
419                            the devserver is not up (float).
420
421        @return: A dictionary of the devserver's load.
422
423        """
424        call = cls._build_call(devserver, 'check_health')
425        @remote_devserver_call(timeout_min=timeout_min)
426        def get_load(devserver=devserver):
427            """Inner method that makes the call."""
428            return cls.run_call(call, timeout=timeout_min*60)
429
430        try:
431            return json.load(cStringIO.StringIO(get_load(devserver=devserver)))
432        except Exception as e:
433            logging.error('Devserver call failed: "%s", timeout: %s seconds,'
434                          ' Error: %s', call, timeout_min * 60, e)
435
436
437    @classmethod
438    def is_free_disk_ok(cls, load):
439        """Check if a devserver has enough free disk.
440
441        @param load: A dict of the load of the devserver.
442
443        @return: True if the devserver has enough free disk or disk check is
444                 skipped in global config.
445
446        """
447        if SKIP_DEVSERVER_HEALTH_CHECK:
448            logging.debug('devserver health check is skipped.')
449        elif load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB:
450            return False
451
452        return True
453
454
455    @classmethod
456    def is_apache_client_count_ok(cls, load):
457        """Check if a devserver has enough Apache connections available.
458
459        Apache server by default has maximum of 150 concurrent connections. If
460        a devserver has too many live connections, it likely indicates the
461        server is busy handling many long running download requests, e.g.,
462        downloading stateful partitions. It is better not to add more requests
463        to it.
464
465        @param load: A dict of the load of the devserver.
466
467        @return: True if the devserver has enough Apache connections available,
468                 or disk check is skipped in global config.
469
470        """
471        if SKIP_DEVSERVER_HEALTH_CHECK:
472            logging.debug('devserver health check is skipped.')
473        elif cls.APACHE_CLIENT_COUNT not in load:
474            logging.debug('Apache client count is not collected from devserver.')
475        elif (load[cls.APACHE_CLIENT_COUNT] >
476              cls._MAX_APACHE_CLIENT_COUNT):
477            return False
478
479        return True
480
481
482    @classmethod
483    def devserver_healthy(cls, devserver,
484                          timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
485        """Returns True if the |devserver| is healthy to stage build.
486
487        @param devserver: url of the devserver.
488        @param timeout_min: How long to wait in minutes before deciding the
489                            the devserver is not up (float).
490
491        @return: True if devserver is healthy. Return False otherwise.
492
493        """
494        c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy')
495        reason = ''
496        healthy = False
497        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
498        try:
499            if not load:
500                # Failed to get the load of devserver.
501                reason = '(1) Failed to get load.'
502                return False
503
504            apache_ok = cls.is_apache_client_count_ok(load)
505            if not apache_ok:
506                reason = '(2) Apache client count too high.'
507                logging.error('Devserver check_health failed. Live Apache client '
508                              'count is too high: %d.',
509                              load[cls.APACHE_CLIENT_COUNT])
510                return False
511
512            disk_ok = cls.is_free_disk_ok(load)
513            if not disk_ok:
514                reason = '(3) Disk space too low.'
515                logging.error('Devserver check_health failed. Free disk space is '
516                              'low. Only %dGB is available.',
517                              load[cls.FREE_DISK])
518            healthy = bool(disk_ok)
519            return disk_ok
520        finally:
521            c.increment(fields={'dev_server': cls(devserver).resolved_hostname,
522                                'healthy': healthy,
523                                'reason': reason})
524            # Monitor how many AU processes the devserver is currently running.
525            if load is not None and load.get(DevServer.AU_PROCESS):
526                c_au = metrics.Gauge(
527                        'chromeos/autotest/devserver/devserver_au_count')
528                c_au.set(
529                    load.get(DevServer.AU_PROCESS),
530                    fields={'dev_server': cls(devserver).resolved_hostname})
531
532
533    @staticmethod
534    def _build_call(host, method, **kwargs):
535        """Build a URL to |host| that calls |method|, passing |kwargs|.
536
537        Builds a URL that calls |method| on the dev server defined by |host|,
538        passing a set of key/value pairs built from the dict |kwargs|.
539
540        @param host: a string that is the host basename e.g. http://server:90.
541        @param method: the dev server method to call.
542        @param kwargs: a dict mapping arg names to arg values.
543        @return the URL string.
544        """
545        argstr = '&'.join(map(lambda x: "%s=%s" % x, kwargs.iteritems()))
546        return "%(host)s/%(method)s?%(argstr)s" % dict(
547                host=host, method=method, argstr=argstr)
548
549
550    def build_call(self, method, **kwargs):
551        """Builds a devserver RPC string that is used by 'run_call()'.
552
553        @param method: remote devserver method to call.
554        """
555        return self._build_call(self._devserver, method, **kwargs)
556
557
558    @classmethod
559    def build_all_calls(cls, method, **kwargs):
560        """Builds a list of URLs that makes RPC calls on all devservers.
561
562        Build a URL that calls |method| on the dev server, passing a set
563        of key/value pairs built from the dict |kwargs|.
564
565        @param method: the dev server method to call.
566        @param kwargs: a dict mapping arg names to arg values
567
568        @return the URL string
569        """
570        calls = []
571        # Note we use cls.servers as servers is class specific.
572        for server in cls.servers():
573            if cls.devserver_healthy(server):
574                calls.append(cls._build_call(server, method, **kwargs))
575
576        return calls
577
578
579    @classmethod
580    def run_call(cls, call, readline=False, timeout=None):
581        """Invoke a given devserver call using urllib.open.
582
583        Open the URL with HTTP, and return the text of the response. Exceptions
584        may be raised as for urllib2.urlopen().
585
586        @param call: a url string that calls a method to a devserver.
587        @param readline: whether read http response line by line.
588        @param timeout: The timeout seconds for this urlopen call.
589
590        @return the results of this call.
591        """
592        if timeout is not None:
593            return utils.urlopen_socket_timeout(
594                    call, timeout=timeout).read()
595        elif readline:
596            response = urllib2.urlopen(call)
597            return [line.rstrip() for line in response]
598        else:
599            return urllib2.urlopen(call).read()
600
601
602    @staticmethod
603    def servers():
604        """Returns a list of servers that can serve as this type of server."""
605        raise NotImplementedError()
606
607
608    @classmethod
609    def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT,
610                                      unrestricted_only=False):
611        """Get the devservers in the same subnet of the given ip.
612
613        @param ip: The IP address of a dut to look for devserver.
614        @param mask_bits: Number of mask bits. Default is 19.
615        @param unrestricted_only: Set to True to select from devserver in
616                unrestricted subnet only. Default is False.
617
618        @return: A list of devservers in the same subnet of the given ip.
619
620        """
621        # server from cls.servers() is a URL, e.g., http://10.1.1.10:8082, so
622        # we need a dict to return the full devserver path once the IPs are
623        # filtered in get_servers_in_same_subnet.
624        server_names = {}
625        all_devservers = []
626        devservers = (cls.get_unrestricted_devservers() if unrestricted_only
627                      else cls.servers())
628        for server in devservers:
629            server_name = get_hostname(server)
630            server_names[server_name] = server
631            all_devservers.append(server_name)
632        if not all_devservers:
633            devserver_type = 'unrestricted only' if unrestricted_only else 'all'
634            raise DevServerFailToLocateException(
635                'Fail to locate a devserver for dut %s in %s devservers'
636                % (ip, devserver_type))
637
638        devservers = utils.get_servers_in_same_subnet(ip, mask_bits,
639                                                      all_devservers)
640        return [server_names[s] for s in devservers]
641
642
643    @classmethod
644    def get_unrestricted_devservers(
645                cls, restricted_subnets=utils.RESTRICTED_SUBNETS):
646        """Get the devservers not in any restricted subnet specified in
647        restricted_subnets.
648
649        @param restricted_subnets: A list of restriected subnets.
650
651        @return: A list of devservers not in any restricted subnet.
652
653        """
654        if not restricted_subnets:
655            return cls.servers()
656
657        devservers = []
658        for server in cls.servers():
659            server_name = get_hostname(server)
660            if not utils.get_restricted_subnet(server_name, restricted_subnets):
661                devservers.append(server)
662        return devservers
663
664
665    @classmethod
666    def get_healthy_devserver(cls, build, devservers, ban_list=None):
667        """"Get a healthy devserver instance from the list of devservers.
668
669        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
670        @param devservers: The devserver list to be chosen out a healthy one.
671        @param ban_list: The blacklist of devservers we don't want to choose.
672                Default is None.
673
674        @return: A DevServer object of a healthy devserver. Return None if no
675                healthy devserver is found.
676
677        """
678        logging.debug('Pick one healthy devserver from %r', devservers)
679        while devservers:
680            hash_index = hash(build) % len(devservers)
681            devserver = devservers.pop(hash_index)
682            logging.debug('Check health for %s', devserver)
683            if ban_list and devserver in ban_list:
684                continue
685
686            if cls.devserver_healthy(devserver):
687                logging.debug('Pick %s', devserver)
688                return cls(devserver)
689
690
691    @classmethod
692    def get_available_devservers(cls, hostname=None,
693                                 prefer_local_devserver=PREFER_LOCAL_DEVSERVER,
694                                 restricted_subnets=utils.RESTRICTED_SUBNETS):
695        """Get devservers in the same subnet of the given hostname.
696
697        @param hostname: Hostname of a DUT to choose devserver for.
698
699        @return: A tuple of (devservers, can_retry), devservers is a list of
700                 devservers that's available for the given hostname. can_retry
701                 is a flag that indicate if caller can retry the selection of
702                 devserver if no devserver in the returned devservers can be
703                 used. For example, if hostname is in a restricted subnet,
704                 can_retry will be False.
705        """
706        logging.info('Getting devservers for host: %s',  hostname)
707        host_ip = None
708        if hostname:
709            host_ip = bin_utils.get_ip_address(hostname)
710            if not host_ip:
711                logging.error('Failed to get IP address of %s. Will pick a '
712                              'devserver without subnet constraint.', hostname)
713
714        if not host_ip:
715            return cls.get_unrestricted_devservers(restricted_subnets), False
716
717        # Go through all restricted subnet settings and check if the DUT is
718        # inside a restricted subnet. If so, only return the devservers in the
719        # restricted subnet and doesn't allow retry.
720        if host_ip and restricted_subnets:
721            for subnet_ip, mask_bits in restricted_subnets:
722                if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits):
723                    logging.debug('The host %s (%s) is in a restricted subnet. '
724                                  'Try to locate a devserver inside subnet '
725                                  '%s:%d.', hostname, host_ip, subnet_ip,
726                                  mask_bits)
727                    devservers = cls.get_devservers_in_same_subnet(
728                            subnet_ip, mask_bits)
729                    return devservers, False
730
731        # If prefer_local_devserver is set to True and the host is not in
732        # restricted subnet, pick a devserver in the same subnet if possible.
733        # Set can_retry to True so it can pick a different devserver if all
734        # devservers in the same subnet are down.
735        if prefer_local_devserver:
736            return (cls.get_devservers_in_same_subnet(
737                    host_ip, DEFAULT_SUBNET_MASKBIT, True), True)
738
739        return cls.get_unrestricted_devservers(restricted_subnets), False
740
741
742    @classmethod
743    def resolve(cls, build, hostname=None, ban_list=None):
744        """"Resolves a build to a devserver instance.
745
746        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
747        @param hostname: The hostname of dut that requests a devserver. It's
748                         used to make sure a devserver in the same subnet is
749                         preferred.
750        @param ban_list: The blacklist of devservers shouldn't be chosen.
751
752        @raise DevServerException: If no devserver is available.
753        """
754        tried_devservers = set()
755        devservers, can_retry = cls.get_available_devservers(hostname)
756        if devservers:
757            tried_devservers |= set(devservers)
758
759        devserver = cls.get_healthy_devserver(build, devservers,
760                                              ban_list=ban_list)
761
762        if not devserver and can_retry:
763            # Find available devservers without dut location constrain.
764            devservers, _ = cls.get_available_devservers()
765            devserver = cls.get_healthy_devserver(build, devservers,
766                                                  ban_list=ban_list)
767            if devservers:
768                tried_devservers |= set(devservers)
769        if devserver:
770            return devserver
771        else:
772            error_msg = ('All devservers are currently down: %s. '
773                         'dut hostname: %s' %
774                         (tried_devservers, hostname))
775            logging.error(error_msg)
776            raise DevServerException(error_msg)
777
778
779    @classmethod
780    def random(cls):
781        """Return a random devserver that's available.
782
783        Devserver election in `resolve` method is based on a hash of the
784        build that a caller wants to stage. The purpose is that different
785        callers requesting for the same build can get the same devserver,
786        while the lab is able to distribute different builds across all
787        devservers. That helps to reduce the duplication of builds across
788        all devservers.
789        This function returns a random devserver, by passing a random
790        pseudo build name to `resolve `method.
791        """
792        return cls.resolve(build=str(time.time()))
793
794
class CrashServer(DevServer):
    """Class of DevServer that symbolicates crash dumps."""

    @staticmethod
    def servers():
        """Return the crash servers reported by _get_crash_server_list()."""
        return _get_crash_server_list()


    @remote_devserver_call()
    def symbolicate_dump(self, minidump_path, build):
        """Ask the devserver to symbolicate the dump at minidump_path.

        Stage the debug symbols for |build| and, if that works, ask the
        devserver to symbolicate the dump at |minidump_path|.

        @param minidump_path: the on-disk path of the minidump.
        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                      whose debug symbols are needed for symbolication.
        @return The contents of the stack trace, or an empty string if the
                `requests` module cannot be imported.
        @raise urllib2.HTTPError is raised here upon any return code that's
               not HTTP OK. Callers are documented to see DevServerException;
               presumably remote_devserver_call converts it -- TODO confirm.
        """
        try:
            import requests
        except ImportError:
            logging.warning("Can't 'import requests' to connect to dev server.")
            return ''
        f = {'dev_server': self.resolved_hostname}
        c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
        c.increment(fields=f)
        # Symbolicate minidump.
        m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
        with metrics.SecondsTimer(m, fields=f):
            call = self.build_call('symbolicate_dump',
                                   archive_url=_get_image_storage_server() + build)
            # Close the minidump as soon as the upload is done, instead of
            # leaving the file handle open until garbage collection.
            with open(minidump_path, 'rb') as minidump:
                request = requests.post(call, files={'minidump': minidump})
            if request.status_code == requests.codes.OK:
                return request.text

        # Any other status is reported as an HTTPError carrying the response
        # text as its body.
        error_fd = cStringIO.StringIO(request.text)
        raise urllib2.HTTPError(
                call, request.status_code, request.text, request.headers,
                error_fd)


    @classmethod
    def get_available_devservers(cls, hostname=None):
        """Get all available crash servers.

        Crash server election doesn't need to count the location of hostname.

        @param hostname: Hostname of a DUT to choose devserver for. It is not
                         used; it defaults to None so the method can also be
                         called without arguments.

        @return: A tuple of (all crash servers, False). can_retry is set to
                 False, as all crash servers are returned. There is no point to
                 retry.
        """
        return cls.servers(), False
853
854
855class ImageServerBase(DevServer):
856    """Base class for devservers used to stage builds.
857
858    CrOS and Android builds are staged in different ways as they have different
859    sets of artifacts. This base class abstracts the shared functions between
860    the two types of ImageServer.
861    """
862
863    @classmethod
864    def servers(cls):
865        """Returns a list of servers that can serve as a desired type of
866        devserver.
867        """
868        return _get_dev_server_list()
869
870
871    def _get_image_url(self, image):
872        """Returns the url of the directory for this image on the devserver.
873
874        @param image: the image that was fetched.
875        """
876        image = self.translate(image)
877        url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
878                                              type=str)
879        return (url_pattern % (self.url(), image)).replace('update', 'static')
880
881
882    @staticmethod
883    def create_metadata(server_name, image, artifacts=None, files=None):
884        """Create a metadata dictionary given the staged items.
885
886        The metadata can be send to metadata db along with stats.
887
888        @param server_name: name of the devserver, e.g 172.22.33.44.
889        @param image: The name of the image.
890        @param artifacts: A list of artifacts.
891        @param files: A list of files.
892
893        @return A metadata dictionary.
894
895        """
896        metadata = {'devserver': server_name,
897                    'image': image,
898                    '_type': 'devserver'}
899        if artifacts:
900            metadata['artifacts'] = ' '.join(artifacts)
901        if files:
902            metadata['files'] = ' '.join(files)
903        return metadata
904
905
906    @classmethod
907    def run_ssh_call(cls, call, readline=False, timeout=None):
908        """Construct an ssh-based rpc call, and execute it.
909
910        @param call: a url string that calls a method to a devserver.
911        @param readline: whether read http response line by line.
912        @param timeout: The timeout seconds for ssh call.
913
914        @return the results of this call.
915        """
916        hostname = get_hostname(call)
917        ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call))
918        timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60
919        try:
920            result = utils.run(ssh_call, timeout=timeout_seconds)
921        except error.CmdError as e:
922            logging.debug('Error occurred with exit_code %d when executing the '
923                          'ssh call: %s.', e.result_obj.exit_status,
924                          e.result_obj.stderr)
925            c = metrics.Counter('chromeos/autotest/devserver/ssh_failure')
926            c.increment(fields={'dev_server': hostname})
927            raise
928        response = result.stdout
929
930        # If the curl command's returned HTTP response contains certain
931        # exception string, raise the DevServerException of the response.
932        if 'DownloaderException' in response:
933            raise DevServerException(_strip_http_message(response))
934
935        if readline:
936            # Remove line terminators and trailing whitespace
937            response = response.splitlines()
938            return [line.rstrip() for line in response]
939
940        return response
941
942
943    @classmethod
944    def run_call(cls, call, readline=False, timeout=None):
945        """Invoke a given devserver call using urllib.open or ssh.
946
947        Open the URL with HTTP or SSH-based HTTP, and return the text of the
948        response. Exceptions may be raised as for urllib2.urlopen() or
949        utils.run().
950
951        @param call: a url string that calls a method to a devserver.
952        @param readline: whether read http response line by line.
953        @param timeout: The timeout seconds for urlopen call or ssh call.
954
955        @return the results of this call.
956        """
957        server_name = get_hostname(call)
958        is_in_restricted_subnet = utils.get_restricted_subnet(
959                server_name, utils.RESTRICTED_SUBNETS)
960        if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or
961            not is_in_restricted_subnet):
962            return super(ImageServerBase, cls).run_call(
963                    call, readline=readline, timeout=timeout)
964        else:
965            return cls.run_ssh_call(
966                    call, readline=readline, timeout=timeout)
967
968
969    @classmethod
970    def download_file(cls, remote_file, local_file, timeout=None):
971        """Download file from devserver.
972
973        The format of remote_file should be:
974            http://devserver_ip:8082/static/board/...
975
976        @param remote_file: The URL of the file on devserver that need to be
977            downloaded.
978        @param local_file: The path of the file saved to local.
979        @param timeout: The timeout seconds for this call.
980        """
981        response = cls.run_call(remote_file, timeout=timeout)
982        with open(local_file, 'w') as out_log:
983            out_log.write(response)
984
985
986    def _poll_is_staged(self, **kwargs):
987        """Polling devserver.is_staged until all artifacts are staged.
988
989        @param kwargs: keyword arguments to make is_staged devserver call.
990
991        @return: True if all artifacts are staged in devserver.
992        """
993        call = self.build_call('is_staged', **kwargs)
994
995        def all_staged():
996            """Call devserver.is_staged rpc to check if all files are staged.
997
998            @return: True if all artifacts are staged in devserver. False
999                     otherwise.
1000            @rasies DevServerException, the exception is a wrapper of all
1001                    exceptions that were raised when devserver tried to download
1002                    the artifacts. devserver raises an HTTPError or a CmdError
1003                    when an exception was raised in the code. Such exception
1004                    should be re-raised here to stop the caller from waiting.
1005                    If the call to devserver failed for connection issue, a
1006                    URLError exception is raised, and caller should retry the
1007                    call to avoid such network flakiness.
1008
1009            """
1010            try:
1011                result = self.run_call(call)
1012                logging.debug('whether artifact is staged: %r', result)
1013                return result == 'True'
1014            except urllib2.HTTPError as e:
1015                error_markup = e.read()
1016                raise DevServerException(_strip_http_message(error_markup))
1017            except urllib2.URLError as e:
1018                # Could be connection issue, retry it.
1019                # For example: <urlopen error [Errno 111] Connection refused>
1020                logging.error('URLError happens in is_stage: %r', e)
1021                return False
1022            except error.CmdError as e:
1023                # Retry if SSH failed to connect to the devserver.
1024                logging.warning('CmdError happens in is_stage: %r, will retry', e)
1025                return False
1026
1027        bin_utils.poll_for_condition(
1028                all_staged,
1029                exception=bin_utils.TimeoutError(),
1030                timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60,
1031                sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL)
1032
1033        return True
1034
1035
1036    def _call_and_wait(self, call_name, error_message,
1037                       expected_response=SUCCESS, **kwargs):
1038        """Helper method to make a urlopen call, and wait for artifacts staged.
1039
1040        @param call_name: name of devserver rpc call.
1041        @param error_message: Error message to be thrown if response does not
1042                              match expected_response.
1043        @param expected_response: Expected response from rpc, default to
1044                                  |Success|. If it's set to None, do not compare
1045                                  the actual response. Any response is consider
1046                                  to be good.
1047        @param kwargs: keyword arguments to make is_staged devserver call.
1048
1049        @return: The response from rpc.
1050        @raise DevServerException upon any return code that's expected_response.
1051
1052        """
1053        call = self.build_call(call_name, async=True, **kwargs)
1054        try:
1055            response = self.run_call(call)
1056            logging.debug('response for RPC: %r', response)
1057            if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response:
1058                logging.debug('Proxy error happens in RPC call, '
1059                              'will retry in 30 seconds')
1060                time.sleep(30)
1061                raise DevServerOverloadException()
1062        except httplib.BadStatusLine as e:
1063            logging.error(e)
1064            raise DevServerException('Received Bad Status line, Devserver %s '
1065                                     'might have gone down while handling '
1066                                     'the call: %s' % (self.url(), call))
1067
1068        if expected_response and not response == expected_response:
1069                raise DevServerException(error_message)
1070
1071        # `os_type` is needed in build a devserver call, but not needed for
1072        # wait_for_artifacts_staged, since that method is implemented by
1073        # each ImageServerBase child class.
1074        if 'os_type' in kwargs:
1075            del kwargs['os_type']
1076        self.wait_for_artifacts_staged(**kwargs)
1077        return response
1078
1079
1080    def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
1081        """Tell the devserver to download and stage |artifacts| from |image|
1082        specified by kwargs.
1083
1084        This is the main call point for staging any specific artifacts for a
1085        given build. To see the list of artifacts one can stage see:
1086
1087        ~src/platfrom/dev/artifact_info.py.
1088
1089        This is maintained along with the actual devserver code.
1090
1091        @param artifacts: A list of artifacts.
1092        @param files: A list of files to stage.
1093        @param archive_url: Optional parameter that has the archive_url to stage
1094                this artifact from. Default is specified in autotest config +
1095                image.
1096        @param kwargs: keyword arguments that specify the build information, to
1097                make stage devserver call.
1098
1099        @raise DevServerException upon any return code that's not HTTP OK.
1100        """
1101        if not archive_url:
1102            archive_url = _get_storage_server_for_artifacts(artifacts) + build
1103
1104        artifacts_arg = ','.join(artifacts) if artifacts else ''
1105        files_arg = ','.join(files) if files else ''
1106        error_message = ("staging %s for %s failed;"
1107                         "HTTP OK not accompanied by 'Success'." %
1108                         ('artifacts=%s files=%s ' % (artifacts_arg, files_arg),
1109                          build))
1110
1111        staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
1112                        (build, artifacts, files, archive_url))
1113        logging.info('Staging artifacts on devserver %s: %s',
1114                     self.url(), staging_info)
1115        success = False
1116        try:
1117            arguments = {'archive_url': archive_url,
1118                         'artifacts': artifacts_arg,
1119                         'files': files_arg}
1120            if kwargs:
1121                arguments.update(kwargs)
1122            # TODO(akeshet): canonicalize artifacts_arg before using it as a
1123            # metric field (as it stands it is a not-very-well-controlled
1124            # string).
1125            f = {'artifacts': artifacts_arg,
1126                 'dev_server': self.resolved_hostname}
1127            with metrics.SecondsTimer(
1128                    'chromeos/autotest/devserver/stage_artifact_duration',
1129                    fields=f):
1130                self.call_and_wait(call_name='stage', error_message=error_message,
1131                                   **arguments)
1132            logging.info('Finished staging artifacts: %s', staging_info)
1133            success = True
1134        except (bin_utils.TimeoutError, error.TimeoutException):
1135            logging.error('stage_artifacts timed out: %s', staging_info)
1136            raise DevServerException(
1137                    'stage_artifacts timed out: %s' % staging_info)
1138        finally:
1139            f = {'success': success,
1140                 'artifacts': artifacts_arg,
1141                 'dev_server': self.resolved_hostname}
1142            metrics.Counter('chromeos/autotest/devserver/stage_artifact'
1143                            ).increment(fields=f)
1144
1145
1146    def call_and_wait(self, *args, **kwargs):
1147        """Helper method to make a urlopen call, and wait for artifacts staged.
1148
1149        This method needs to be overridden in the subclass to implement the
1150        logic to call _call_and_wait.
1151        """
1152        raise NotImplementedError
1153
1154
1155    def _trigger_download(self, build, artifacts, files, synchronous=True,
1156                          **kwargs_build_info):
1157        """Tell the devserver to download and stage image specified in
1158        kwargs_build_info.
1159
1160        Tells the devserver to fetch |image| from the image storage server
1161        named by _get_image_storage_server().
1162
1163        If |synchronous| is True, waits for the entire download to finish
1164        staging before returning. Otherwise only the artifacts necessary
1165        to start installing images onto DUT's will be staged before returning.
1166        A caller can then call finish_download to guarantee the rest of the
1167        artifacts have finished staging.
1168
1169        @param synchronous: if True, waits until all components of the image are
1170               staged before returning.
1171        @param kwargs_build_info: Dictionary of build information.
1172                For CrOS, it is None as build is the CrOS image name.
1173                For Android, it is {'target': target,
1174                                    'build_id': build_id,
1175                                    'branch': branch}
1176
1177        @raise DevServerException upon any return code that's not HTTP OK.
1178
1179        """
1180        if kwargs_build_info:
1181            archive_url = None
1182        else:
1183            archive_url = _get_image_storage_server() + build
1184        error_message = ("trigger_download for %s failed;"
1185                         "HTTP OK not accompanied by 'Success'." % build)
1186        kwargs = {'archive_url': archive_url,
1187                  'artifacts': artifacts,
1188                  'files': files,
1189                  'error_message': error_message}
1190        if kwargs_build_info:
1191            kwargs.update(kwargs_build_info)
1192
1193        logging.info('trigger_download starts for %s', build)
1194        try:
1195            response = self.call_and_wait(call_name='stage', **kwargs)
1196            logging.info('trigger_download finishes for %s', build)
1197        except (bin_utils.TimeoutError, error.TimeoutException):
1198            logging.error('trigger_download timed out for %s.', build)
1199            raise DevServerException(
1200                    'trigger_download timed out for %s.' % build)
1201        was_successful = response == SUCCESS
1202        if was_successful and synchronous:
1203            self._finish_download(build, artifacts, files, **kwargs_build_info)
1204
1205
1206    def _finish_download(self, build, artifacts, files, **kwargs_build_info):
1207        """Tell the devserver to finish staging image specified in
1208        kwargs_build_info.
1209
1210        If trigger_download is called with synchronous=False, it will return
1211        before all artifacts have been staged. This method contacts the
1212        devserver and blocks until all staging is completed and should be
1213        called after a call to trigger_download.
1214
1215        @param kwargs_build_info: Dictionary of build information.
1216                For CrOS, it is None as build is the CrOS image name.
1217                For Android, it is {'target': target,
1218                                    'build_id': build_id,
1219                                    'branch': branch}
1220
1221        @raise DevServerException upon any return code that's not HTTP OK.
1222        """
1223        archive_url = _get_image_storage_server() + build
1224        error_message = ("finish_download for %s failed;"
1225                         "HTTP OK not accompanied by 'Success'." % build)
1226        kwargs = {'archive_url': archive_url,
1227                  'artifacts': artifacts,
1228                  'files': files,
1229                  'error_message': error_message}
1230        if kwargs_build_info:
1231            kwargs.update(kwargs_build_info)
1232        try:
1233            self.call_and_wait(call_name='stage', **kwargs)
1234        except (bin_utils.TimeoutError, error.TimeoutException):
1235            logging.error('finish_download timed out for %s', build)
1236            raise DevServerException(
1237                    'finish_download timed out for %s.' % build)
1238
1239
1240    @remote_devserver_call()
1241    def locate_file(self, file_name, artifacts, build, build_info):
1242        """Locate a file with the given file_name on devserver.
1243
1244        This method calls devserver RPC `locate_file` to look up a file with
1245        the given file name inside specified build artifacts.
1246
1247        @param file_name: Name of the file to look for a file.
1248        @param artifacts: A list of artifact names to search for the file.
1249        @param build: Name of the build. For Android, it's None as build_info
1250                should be used.
1251        @param build_info: Dictionary of build information.
1252                For CrOS, it is None as build is the CrOS image name.
1253                For Android, it is {'target': target,
1254                                    'build_id': build_id,
1255                                    'branch': branch}
1256
1257        @return: A devserver url to the file.
1258        @raise DevServerException upon any return code that's not HTTP OK.
1259        """
1260        if not build and not build_info:
1261            raise DevServerException('You must specify build information to '
1262                                     'look for file %s in artifacts %s.' %
1263                                     (file_name, artifacts))
1264        kwargs = {'file_name': file_name,
1265                  'artifacts': artifacts}
1266        if build_info:
1267            build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info
1268            kwargs.update(build_info)
1269            # Devserver treats Android and Brillo build in the same way as they
1270            # are both retrieved from Launch Control and have similar build
1271            # artifacts. Therefore, os_type for devserver calls is `android` for
1272            # both Android and Brillo builds.
1273            kwargs['os_type'] = 'android'
1274        else:
1275            build_path = build
1276            kwargs['build'] = build
1277        call = self.build_call('locate_file', async=False, **kwargs)
1278        try:
1279            file_path = self.run_call(call)
1280            return os.path.join(self.url(), 'static', build_path, file_path)
1281        except httplib.BadStatusLine as e:
1282            logging.error(e)
1283            raise DevServerException('Received Bad Status line, Devserver %s '
1284                                     'might have gone down while handling '
1285                                     'the call: %s' % (self.url(), call))
1286
1287
1288    @remote_devserver_call()
1289    def list_control_files(self, build, suite_name=''):
1290        """Ask the devserver to list all control files for |build|.
1291
1292        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1293                      whose control files the caller wants listed.
1294        @param suite_name: The name of the suite for which we require control
1295                           files.
1296        @return None on failure, or a list of control file paths
1297                (e.g. server/site_tests/autoupdate/control)
1298        @raise DevServerException upon any return code that's not HTTP OK.
1299        """
1300        build = self.translate(build)
1301        call = self.build_call('controlfiles', build=build,
1302                               suite_name=suite_name)
1303        return self.run_call(call, readline=True)
1304
1305
1306    @remote_devserver_call()
1307    def get_control_file(self, build, control_path):
1308        """Ask the devserver for the contents of a control file.
1309
1310        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1311                      whose control file the caller wants to fetch.
1312        @param control_path: The file to fetch
1313                             (e.g. server/site_tests/autoupdate/control)
1314        @return The contents of the desired file.
1315        @raise DevServerException upon any return code that's not HTTP OK.
1316        """
1317        build = self.translate(build)
1318        call = self.build_call('controlfiles', build=build,
1319                               control_path=control_path)
1320        return self.run_call(call)
1321
1322
1323    @remote_devserver_call()
1324    def list_suite_controls(self, build, suite_name=''):
1325        """Ask the devserver to list contents of all control files for |build|.
1326
1327        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
1328                      whose control files' contents the caller wants returned.
1329        @param suite_name: The name of the suite for which we require control
1330                           files.
1331        @return None on failure, or a dict of contents of all control files
1332            (e.g. {'path1': "#Copyright controls ***", ...,
1333                pathX': "#Copyright controls ***"}
1334        @raise DevServerException upon any return code that's not HTTP OK.
1335        """
1336        build = self.translate(build)
1337        call = self.build_call('list_suite_controls', build=build,
1338                               suite_name=suite_name)
1339        return json.load(cStringIO.StringIO(self.run_call(call)))
1340
1341
1342class ImageServer(ImageServerBase):
1343    """Class for DevServer that handles RPCs related to CrOS images.
1344
1345    The calls to devserver to stage artifacts, including stage and download, are
1346    made in async mode. That is, when caller makes an RPC |stage| to request
1347    devserver to stage certain artifacts, devserver handles the call and starts
1348    staging artifacts in a new thread, and return |Success| without waiting for
1349    staging being completed. When caller receives message |Success|, it polls
1350    devserver's is_staged call until all artifacts are staged.
    This mechanism is designed to prevent devserver from running out of
    cherrypy threads, as staging artifacts might take a long time, and cherrypy
    starts with a fixed number of threads that handle devserver rpc.
1354    """
1355
1356    class ArtifactUrls(object):
1357        """A container for URLs of staged artifacts.
1358
1359        Attributes:
1360            full_payload: URL for downloading a staged full release update
1361            mton_payload: URL for downloading a staged M-to-N release update
1362            nton_payload: URL for downloading a staged N-to-N release update
1363
1364        """
1365        def __init__(self, full_payload=None, mton_payload=None,
1366                     nton_payload=None):
1367            self.full_payload = full_payload
1368            self.mton_payload = mton_payload
1369            self.nton_payload = nton_payload
1370
1371
1372    def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
1373        """Polling devserver.is_staged until all artifacts are staged.
1374
1375        @param archive_url: Google Storage URL for the build.
1376        @param artifacts: Comma separated list of artifacts to download.
1377        @param files: Comma separated list of files to download.
1378        @return: True if all artifacts are staged in devserver.
1379        """
1380        kwargs = {'archive_url': archive_url,
1381                  'artifacts': artifacts,
1382                  'files': files}
1383        return self._poll_is_staged(**kwargs)
1384
1385
1386    @remote_devserver_call()
1387    def call_and_wait(self, call_name, archive_url, artifacts, files,
1388                      error_message, expected_response=SUCCESS):
1389        """Helper method to make a urlopen call, and wait for artifacts staged.
1390
1391        @param call_name: name of devserver rpc call.
1392        @param archive_url: Google Storage URL for the build..
1393        @param artifacts: Comma separated list of artifacts to download.
1394        @param files: Comma separated list of files to download.
1395        @param expected_response: Expected response from rpc, default to
1396                                  |Success|. If it's set to None, do not compare
1397                                  the actual response. Any response is consider
1398                                  to be good.
1399        @param error_message: Error message to be thrown if response does not
1400                              match expected_response.
1401
1402        @return: The response from rpc.
1403        @raise DevServerException upon any return code that's expected_response.
1404
1405        """
1406        kwargs = {'archive_url': archive_url,
1407                  'artifacts': artifacts,
1408                  'files': files}
1409        return self._call_and_wait(call_name, error_message,
1410                                   expected_response, **kwargs)
1411
1412
1413    @remote_devserver_call()
1414    def stage_artifacts(self, image=None, artifacts=None, files='',
1415                        archive_url=None):
1416        """Tell the devserver to download and stage |artifacts| from |image|.
1417
1418         This is the main call point for staging any specific artifacts for a
1419        given build. To see the list of artifacts one can stage see:
1420
1421        ~src/platfrom/dev/artifact_info.py.
1422
1423        This is maintained along with the actual devserver code.
1424
1425        @param image: the image to fetch and stage.
1426        @param artifacts: A list of artifacts.
1427        @param files: A list of files to stage.
1428        @param archive_url: Optional parameter that has the archive_url to stage
1429                this artifact from. Default is specified in autotest config +
1430                image.
1431
1432        @raise DevServerException upon any return code that's not HTTP OK.
1433        """
1434        if not artifacts and not files:
1435            raise DevServerException('Must specify something to stage.')
1436        image = self.translate(image)
1437        self._stage_artifacts(image, artifacts, files, archive_url)
1438
1439
1440    @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS)
1441    def list_image_dir(self, image):
1442        """List the contents of the image stage directory, on the devserver.
1443
1444        @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>.
1445
1446        @raise DevServerException upon any return code that's not HTTP OK.
1447        """
1448        image = self.translate(image)
1449        logging.info('Requesting contents from devserver %s for image %s',
1450                     self.url(), image)
1451        archive_url = _get_storage_server_for_artifacts() + image
1452        call = self.build_call('list_image_dir', archive_url=archive_url)
1453        response = self.run_call(call, readline=True)
1454        for line in response:
1455            logging.info(line)
1456
1457
1458    def trigger_download(self, image, synchronous=True):
1459        """Tell the devserver to download and stage |image|.
1460
1461        Tells the devserver to fetch |image| from the image storage server
1462        named by _get_image_storage_server().
1463
1464        If |synchronous| is True, waits for the entire download to finish
1465        staging before returning. Otherwise only the artifacts necessary
1466        to start installing images onto DUT's will be staged before returning.
1467        A caller can then call finish_download to guarantee the rest of the
1468        artifacts have finished staging.
1469
1470        @param image: the image to fetch and stage.
1471        @param synchronous: if True, waits until all components of the image are
1472               staged before returning.
1473
1474        @raise DevServerException upon any return code that's not HTTP OK.
1475
1476        """
1477        image = self.translate(image)
1478        artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE
1479        self._trigger_download(image, artifacts, files='',
1480                               synchronous=synchronous)
1481
1482
1483    @remote_devserver_call()
1484    def setup_telemetry(self, build):
1485        """Tell the devserver to setup telemetry for this build.
1486
1487        The devserver will stage autotest and then extract the required files
1488        for telemetry.
1489
1490        @param build: the build to setup telemetry for.
1491
1492        @returns path on the devserver that telemetry is installed to.
1493        """
1494        build = self.translate(build)
1495        archive_url = _get_image_storage_server() + build
1496        call = self.build_call('setup_telemetry', archive_url=archive_url)
1497        try:
1498            response = self.run_call(call)
1499        except httplib.BadStatusLine as e:
1500            logging.error(e)
1501            raise DevServerException('Received Bad Status line, Devserver %s '
1502                                     'might have gone down while handling '
1503                                     'the call: %s' % (self.url(), call))
1504        return response
1505
1506
1507    def finish_download(self, image):
1508        """Tell the devserver to finish staging |image|.
1509
1510        If trigger_download is called with synchronous=False, it will return
1511        before all artifacts have been staged. This method contacts the
1512        devserver and blocks until all staging is completed and should be
1513        called after a call to trigger_download.
1514
1515        @param image: the image to fetch and stage.
1516        @raise DevServerException upon any return code that's not HTTP OK.
1517        """
1518        image = self.translate(image)
1519        artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST
1520        self._finish_download(image, artifacts, files='')
1521
1522
1523    def get_update_url(self, image):
1524        """Returns the url that should be passed to the updater.
1525
1526        @param image: the image that was fetched.
1527        """
1528        image = self.translate(image)
1529        url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
1530                                              type=str)
1531        return (url_pattern % (self.url(), image))
1532
1533
1534    def get_staged_file_url(self, filename, image):
1535        """Returns the url of a staged file for this image on the devserver."""
1536        return '/'.join([self._get_image_url(image), filename])
1537
1538
1539    def get_full_payload_url(self, image):
1540        """Returns a URL to a staged full payload.
1541
1542        @param image: the image that was fetched.
1543
1544        @return A fully qualified URL that can be used for downloading the
1545                payload.
1546
1547        """
1548        return self._get_image_url(image) + '/update.gz'
1549
1550
1551    def get_test_image_url(self, image):
1552        """Returns a URL to a staged test image.
1553
1554        @param image: the image that was fetched.
1555
1556        @return A fully qualified URL that can be used for downloading the
1557                image.
1558
1559        """
1560        return self._get_image_url(image) + '/chromiumos_test_image.bin'
1561
1562
1563    @remote_devserver_call()
1564    def get_dependencies_file(self, build):
1565        """Ask the dev server for the contents of the suite dependencies file.
1566
1567        Ask the dev server at |self._dev_server| for the contents of the
1568        pre-processed suite dependencies file (at DEPENDENCIES_FILE)
1569        for |build|.
1570
1571        @param build: The build (e.g. x86-mario-release/R21-2333.0.0)
1572                      whose dependencies the caller is interested in.
1573        @return The contents of the dependencies file, which should eval to
1574                a dict of dicts, as per bin_utils/suite_preprocessor.py.
1575        @raise DevServerException upon any return code that's not HTTP OK.
1576        """
1577        build = self.translate(build)
1578        call = self.build_call('controlfiles',
1579                               build=build, control_path=DEPENDENCIES_FILE)
1580        return self.run_call(call)
1581
1582
1583    @remote_devserver_call()
1584    def get_latest_build_in_gs(self, board):
1585        """Ask the devservers for the latest offical build in Google Storage.
1586
1587        @param board: The board for who we want the latest official build.
1588        @return A string of the returned build rambi-release/R37-5868.0.0
1589        @raise DevServerException upon any return code that's not HTTP OK.
1590        """
1591        call = self.build_call(
1592                'xbuddy_translate/remote/%s/latest-official' % board,
1593                image_dir=_get_image_storage_server())
1594        image_name = self.run_call(call)
1595        return os.path.dirname(image_name)
1596
1597
1598    def translate(self, build_name):
1599        """Translate the build name if it's in LATEST format.
1600
1601        If the build name is in the format [builder]/LATEST, return the latest
1602        build in Google Storage otherwise return the build name as is.
1603
1604        @param build_name: build_name to check.
1605
1606        @return The actual build name to use.
1607        """
1608        match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
1609        if not match:
1610            return build_name
1611        translated_build = self.get_latest_build_in_gs(match.groups()[0])
1612        logging.debug('Translated relative build %s to %s', build_name,
1613                      translated_build)
1614        return translated_build
1615
1616
1617    @classmethod
1618    @remote_devserver_call()
1619    def get_latest_build(cls, target, milestone=''):
1620        """Ask all the devservers for the latest build for a given target.
1621
1622        @param target: The build target, typically a combination of the board
1623                       and the type of build e.g. x86-mario-release.
1624        @param milestone:  For latest build set to '', for builds only in a
1625                           specific milestone set to a str of format Rxx
1626                           (e.g. R16). Default: ''. Since we are dealing with a
1627                           webserver sending an empty string, '', ensures that
1628                           the variable in the URL is ignored as if it was set
1629                           to None.
1630        @return A string of the returned build e.g. R20-2226.0.0.
1631        @raise DevServerException upon any return code that's not HTTP OK.
1632        """
1633        calls = cls.build_all_calls('latestbuild', target=target,
1634                                    milestone=milestone)
1635        latest_builds = []
1636        for call in calls:
1637            latest_builds.append(cls.run_call(call))
1638
1639        return max(latest_builds, key=version.LooseVersion)
1640
1641
1642    @remote_devserver_call()
1643    def _kill_au_process_for_host(self, **kwargs):
1644        """Kill the triggerred auto_update process if error happens in cros_au.
1645
1646        @param kwargs: Arguments to make kill_au_proc devserver call.
1647        """
1648        call = self.build_call('kill_au_proc', **kwargs)
1649        response = self.run_call(call)
1650        if not response == 'True':
1651            raise DevServerException(
1652                    'Failed to kill the triggerred CrOS auto_update process'
1653                    'on devserver %s, the response is %s' % (
1654                            self.url(), response))
1655
1656
1657    def kill_au_process_for_host(self, host_name, pid):
1658        """Kill the triggerred auto_update process if error happens.
1659
1660        Usually this function is used to clear all potential left au processes
1661        of the given host name.
1662
1663        If pid is specified, the devserver will further check the given pid to
1664        make sure the process is killed. This is used for the case that the au
1665        process has started in background, but then provision fails due to
1666        some unknown issues very fast. In this case, when 'kill_au_proc' is
1667        called, there's no corresponding background track log created for this
1668        ongoing au process, which prevents this RPC call from killing this au
1669        process.
1670
1671        @param host_name: The DUT's hostname.
1672        @param pid: The ongoing au process's pid.
1673
1674        @return: True if successfully kill the auto-update process for host.
1675        """
1676        kwargs = {'host_name': host_name, 'pid': pid}
1677        try:
1678            self._kill_au_process_for_host(**kwargs)
1679        except DevServerException:
1680            return False
1681
1682        return True
1683
1684
1685    @remote_devserver_call()
1686    def _clean_track_log(self, **kwargs):
1687        """Clean track log for the current auto-update process."""
1688        call = self.build_call('handler_cleanup', **kwargs)
1689        self.run_call(call)
1690
1691
1692    def clean_track_log(self, host_name, pid):
1693        """Clean track log for the current auto-update process.
1694
1695        @param host_name: The host name to be updated.
1696        @param pid: The auto-update process id.
1697
1698        @return: True if track log is successfully cleaned, False otherwise.
1699        """
1700        if not pid:
1701            return False
1702
1703        kwargs = {'host_name': host_name, 'pid': pid}
1704        try:
1705            self._clean_track_log(**kwargs)
1706        except DevServerException as e:
1707            logging.debug('Failed to clean track_status_file on '
1708                          'devserver for host %s and process id %s: %s',
1709                          host_name, pid, str(e))
1710            return False
1711
1712        return True
1713
1714
1715    def _get_au_log_filename(self, log_dir, host_name, pid):
1716        """Return the auto-update log's filename."""
1717        return os.path.join(log_dir, CROS_AU_LOG_FILENAME % (
1718                    host_name, pid))
1719
1720    def _read_json_response_from_devserver(self, response):
1721        """Reads the json response from the devserver.
1722
1723        This is extracted to its own function so that it can be easily mocked.
1724        @param response: the response for a devserver.
1725        """
1726        try:
1727            return json.loads(response)
1728        except ValueError as e:
1729            raise DevServerException(e)
1730
1731
1732    @remote_devserver_call()
1733    def _collect_au_log(self, log_dir, **kwargs):
1734        """Collect logs from devserver after cros-update process is finished.
1735
1736        Collect the logs that recording the whole cros-update process, and
1737        write it to sysinfo path of a job.
1738
1739        The example log file name that is stored is like:
1740            '1220-repair/sysinfo/CrOS_update_host_name_pid.log'
1741
1742        @param host_name: the DUT's hostname.
1743        @param pid: the auto-update process id on devserver.
1744        @param log_dir: The directory to save the cros-update process log
1745                        retrieved from devserver.
1746        """
1747        call = self.build_call('collect_cros_au_log', **kwargs)
1748        response = self.run_call(call)
1749        if not os.path.exists(log_dir):
1750            os.mkdir(log_dir)
1751        write_file = self._get_au_log_filename(
1752                log_dir, kwargs['host_name'], kwargs['pid'])
1753        logging.debug('Saving auto-update logs into %s', write_file)
1754
1755        au_logs = self._read_json_response_from_devserver(response)
1756
1757        try:
1758            for k, v in au_logs['host_logs'].items():
1759                log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid'])
1760                log_path = os.path.join(log_dir, log_name)
1761                with open(log_path, 'w') as out_log:
1762                    out_log.write(v)
1763        except IOError as e:
1764            raise DevServerException('Failed to write auto-update hostlogs: '
1765                                     '%s' % e)
1766
1767        try:
1768            with open(write_file, 'w') as out_log:
1769                out_log.write(au_logs['cros_au_log'])
1770        except:
1771            raise DevServerException('Failed to write auto-update logs into '
1772                                     '%s' % write_file)
1773
1774
1775    def collect_au_log(self, host_name, pid, log_dir):
1776        """Collect logs from devserver after cros-update process is finished.
1777
1778        @param host_name: the DUT's hostname.
1779        @param pid: the auto-update process id on devserver.
1780        @param log_dir: The directory to save the cros-update process log
1781                        retrieved from devserver.
1782
1783        @return: True if auto-update log is successfully collected, False
1784          otherwise.
1785        """
1786        if not pid:
1787            return False
1788
1789        kwargs = {'host_name': host_name, 'pid': pid}
1790        try:
1791            self._collect_au_log(log_dir, **kwargs)
1792        except DevServerException as e:
1793            logging.debug('Failed to collect auto-update log on '
1794                          'devserver for host %s and process id %s: %s',
1795                          host_name, pid, str(e))
1796            return False
1797
1798        return True
1799
1800
1801    @remote_devserver_call()
1802    def _trigger_auto_update(self, **kwargs):
1803        """Trigger auto-update by calling devserver.cros_au.
1804
1805        @param kwargs:  Arguments to make cros_au devserver call.
1806
1807        @return: a tuple indicates whether the RPC call cros_au succeeds and
1808          the auto-update process id running on devserver.
1809        """
1810        host_name = kwargs['host_name']
1811        call = self.build_call('cros_au', async=True, **kwargs)
1812        try:
1813            response = self.run_call(call)
1814            logging.info(
1815                'Received response from devserver for cros_au call: %r',
1816                response)
1817        except httplib.BadStatusLine as e:
1818            logging.error(e)
1819            raise DevServerException('Received Bad Status line, Devserver %s '
1820                                     'might have gone down while handling '
1821                                     'the call: %s' % (self.url(), call))
1822
1823        return response
1824
1825
    def _wait_for_auto_update_finished(self, pid, **kwargs):
        """Poll devserver.get_au_status until the auto-update is finished.

        The current auto-update status is used to identify whether the update
        process is finished. The status is polled through
        bin_utils.poll_for_condition every CROS_AU_POLLING_INTERVAL seconds
        for at most DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN minutes.

        @param pid:    The background process id for auto-update in devserver.
        @param kwargs: keyword arguments to make get_au_status devserver call.
                       |pid| is added to them under the key 'pid'.

        @return: True if auto-update is finished for a given dut.

        @raise DevServerException if the devserver reports a detailed error
               message, answers with an HTTP error, or returns a status that
               is not valid JSON.
        """
        logging.debug('Check the progress for auto-update process %r', pid)
        kwargs['pid'] = pid
        # The call is built once and reused for every poll.
        call = self.build_call('get_au_status', **kwargs)

        def all_finished():
            """Call devserver.get_au_status rpc to check if auto-update
               is finished.

            @return: True if auto-update is finished for a given dut. False
                     otherwise, including when the call failed with a
                     URLError, CmdError or socket error, so that the caller
                     polls again instead of giving up on network flakiness.
            @raises  DevServerException when the status carries a
                     'detailed_error_msg', when the devserver answers with
                     an HTTPError (the stripped error page becomes the
                     message), or when the status cannot be decoded as
                     JSON. These stop the caller from waiting.

            """
            try:
                au_status = self.run_call(call)
                response = json.loads(au_status)
                # This is a temp fix to fit both dict and tuple returning
                # values. The dict check will be removed after a corresponding
                # devserver CL is deployed.
                if isinstance(response, dict):
                    if response.get('detailed_error_msg'):
                        raise DevServerException(
                                response.get('detailed_error_msg'))

                    if response.get('finished'):
                        logging.debug('CrOS auto-update is finished')
                        return True
                    else:
                        logging.debug('Current CrOS auto-update status: %s',
                                      response.get('status'))
                        return False

                # Non-dict response: element 0 is the finished flag and
                # element 1 is the status that gets logged.
                if not response[0]:
                    logging.debug('Current CrOS auto-update status: %s',
                                  response[1])
                    return False
                else:
                    logging.debug('CrOS auto-update is finished')
                    return True
            # HTTPError must be handled before URLError: it is a subclass of
            # URLError and would otherwise be treated as retryable.
            except urllib2.HTTPError as e:
                error_markup = e.read()
                raise DevServerException(_strip_http_message(error_markup))
            except urllib2.URLError as e:
                # Could be connection issue, retry it.
                # For example: <urlopen error [Errno 111] Connection refused>
                logging.warning('URLError (%r): Retrying connection to '
                                'devserver to check auto-update status.', e)
                return False
            except error.CmdError:
                # Retry if SSH failed to connect to the devserver.
                logging.warning('CmdError: Retrying SSH connection to check '
                                'auto-update status.')
                return False
            except socket.error as e:
                # Could be some temporary devserver connection issues.
                logging.warning('Socket Error (%r): Retrying connection to '
                                'devserver to check auto-update status.', e)
                return False
            except ValueError as e:
                # Presumably raised by json.loads on a malformed status; the
                # raw status is included to help debugging.
                raise DevServerException(
                        '%s (Got AU status: %r)' % (str(e), au_status))

        # NOTE(review): presumably poll_for_condition raises the given
        # TimeoutError once the timeout elapses -- confirm against
        # bin_utils.poll_for_condition.
        bin_utils.poll_for_condition(
                all_finished,
                exception=bin_utils.TimeoutError(),
                timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60,
                sleep_interval=CROS_AU_POLLING_INTERVAL)

        return True
1913
1914
1915    def wait_for_auto_update_finished(self, response, **kwargs):
1916        """Processing response of 'cros_au' and polling for auto-update status.
1917
1918        Will wait for the whole auto-update process is finished.
1919
1920        @param response: The response from RPC 'cros_au'
1921        @param kwargs: keyword arguments to make get_au_status devserver call.
1922
1923        @return: a tuple includes two elements.
1924          raised_error: None if everything works well or the raised error.
1925          pid: the auto-update process id on devserver.
1926        """
1927
1928        pid = 0
1929        raised_error = None
1930        try:
1931            response = json.loads(response)
1932            if response[0]:
1933                pid = response[1]
1934                logging.debug('start process %r for auto_update in devserver',
1935                              pid)
1936                self._wait_for_auto_update_finished(pid, **kwargs)
1937        except Exception as e:
1938            logging.debug('Failed to trigger auto-update process on devserver')
1939            raised_error = e
1940        finally:
1941            return raised_error, pid
1942
1943
1944    def _parse_AU_error(self, response):
1945        """Parse auto_update error returned from devserver."""
1946        return re.split('\n', response)[-1]
1947
1948
1949    def _classify_exceptions(self, error_list):
1950        """Parse the error that was raised from auto_update.
1951
1952        @param error_list: The list of errors (string) happened in auto-update
1953
1954        @return: A classified exception type (string) from _EXCEPTION_PATTERNS
1955          or 'Unknown exception'. Current patterns in _EXCEPTION_PATTERNS are
1956          very specific so that errors cannot match more than one pattern.
1957        """
1958        raised_error = ''
1959        if not error_list:
1960            return raised_error
1961        else:
1962            target_error = error_list[0]
1963
1964        for err_pattern, classification in _EXCEPTION_PATTERNS:
1965            match = re.match(err_pattern, target_error)
1966            if match:
1967                return classification
1968
1969        return '(0) Unknown exception'
1970
1971
1972    def _check_error_message(self, error_patterns_to_check, error_msg):
1973        """Detect whether specific error pattern exist in error message.
1974
1975        @param error_patterns_to_check: the error patterns to check
1976        @param error_msg: the error message which may include any error
1977                          pattern.
1978
1979        @return A boolean variable, True if error_msg contains any error
1980            pattern in error_patterns_to_check, False otherwise.
1981        """
1982        for err in error_patterns_to_check:
1983            if err in error_msg:
1984                return True
1985
1986        return False
1987
1988
1989    def _is_retryable(self, error_msg):
1990        """Detect whether we will retry auto-update based on error_msg.
1991
1992        @param error_msg: The given error message.
1993
1994        @return A boolean variable which indicates whether we will retry
1995            auto_update with another devserver based on the given error_msg.
1996        """
1997        # For now we just hard-code the error message we think it's suspicious.
1998        # When we get more date about what's the json response when devserver
1999        # is overloaded, we can update this part.
2000        retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE,
2001                                    'is not pingable']
2002        return self._check_error_message(retryable_error_patterns, error_msg)
2003
2004
2005    def _should_use_original_payload(self, error_msg):
2006        devserver_error_patterns = ['DevserverCannotStartError']
2007        return self._check_error_message(devserver_error_patterns, error_msg)
2008
2009
2010    def _parse_buildname_safely(self, build_name):
2011        """Parse a given buildname safely.
2012
2013        @param build_name: the build name to be parsed.
2014
2015        @return: a tuple (board, build_type, milestone)
2016        """
2017        try:
2018            board, build_type, milestone, _ = server_utils.ParseBuildName(
2019                    build_name)
2020        except server_utils.ParseBuildNameException:
2021            logging.warning('Unable to parse build name %s for metrics. '
2022                            'Continuing anyway.', build_name)
2023            board, build_type, milestone = ('', '', '')
2024
2025        return board, build_type, milestone
2026
2027
2028    def auto_update(self, host_name, build_name, original_board=None,
2029                    original_release_version=None, log_dir=None,
2030                    force_update=False, full_update=False,
2031                    payload_filename=None, force_original=False,
2032                    clobber_stateful=True):
2033        """Auto-update a CrOS host.
2034
2035        @param host_name: The hostname of the DUT to auto-update.
2036        @param build_name:  The build name to be auto-updated on the DUT.
2037        @param original_board: The original board of the DUT to auto-update.
2038        @param original_release_version: The release version of the DUT's
2039            current build.
2040        @param log_dir: The log directory to store auto-update logs from
2041            devserver.
2042        @param force_update: Force an update even if the version installed
2043                             is the same. Default: False.
2044        @param full_update:  If True, do not run stateful update, directly
2045                             force a full reimage. If False, try stateful
2046                             update first if the dut is already installed
2047                             with the same version.
2048        @param payload_filename: Used to specify the exact file to
2049                                 use for autoupdating. If None, the payload
2050                                 will be determined by build_name. You
2051                                 must have already staged this file before
2052                                 passing it in here.
2053        @param force_original: Whether to force stateful update with the
2054                               original payload.
2055        @param clobber_stateful: If True do a clean install of stateful.
2056
2057        @return A set (is_success, pid) in which:
2058            1. is_success indicates whether this auto_update succeeds.
2059            2. pid is the process id of the successful autoupdate run.
2060
2061        @raise DevServerException if auto_update fails and is not retryable.
2062        @raise RetryableProvisionException if it fails and is retryable.
2063        """
2064        kwargs = {'host_name': host_name,
2065                  'build_name': build_name,
2066                  'force_update': force_update,
2067                  'full_update': full_update,
2068                  'clobber_stateful': clobber_stateful}
2069
2070        if payload_filename is not None:
2071            kwargs['payload_filename'] = payload_filename
2072
2073        error_msg = 'CrOS auto-update failed for host %s: %s'
2074        error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s'
2075        is_au_success = False
2076        au_log_dir = os.path.join(log_dir,
2077                                  AUTO_UPDATE_LOG_DIR) if log_dir else None
2078        error_list = []
2079        retry_with_another_devserver = False
2080        board, build_type, milestone = self._parse_buildname_safely(build_name)
2081
2082        for au_attempt in range(AU_RETRY_LIMIT):
2083            logging.debug('Start CrOS auto-update for host %s at %d time(s).',
2084                          host_name, au_attempt + 1)
2085            # No matter _trigger_auto_update succeeds or fails, the auto-update
2086            # track_status_file should be cleaned, and the auto-update execute
2087            # log should be collected to directory sysinfo. Also, the error
2088            # raised by _trigger_auto_update should be displayed.
2089            try:
2090                # Try update with stateful.tgz of old release version in the
2091                # last try of auto-update.
2092                if force_original and original_release_version:
2093                    # Monitor this case in monarch
2094                    original_build = '%s/%s' % (original_board,
2095                                                original_release_version)
2096                    c = metrics.Counter(
2097                            'chromeos/autotest/provision/'
2098                            'cros_update_with_original_build')
2099                    f = {'dev_server': self.resolved_hostname,
2100                         'board': board,
2101                         'build_type': build_type,
2102                         'milestone': milestone,
2103                         'original_build': original_build}
2104                    c.increment(fields=f)
2105
2106                    logging.debug('Try updating stateful partition of the '
2107                                  'host with the same version of its current '
2108                                  'rootfs partition: %s', original_build)
2109                    response = self._trigger_auto_update(
2110                            original_build=original_build, **kwargs)
2111                else:
2112                    response = self._trigger_auto_update(**kwargs)
2113            except DevServerException as e:
2114                logging.debug(error_msg_attempt, au_attempt+1, str(e))
2115                error_list.append(str(e))
2116            else:
2117                raised_error, pid = self.wait_for_auto_update_finished(response,
2118                                                                       **kwargs)
2119                # Error happens in _collect_au_log won't be raised. Auto-update
2120                # process will be retried.
2121                if au_log_dir:
2122                    is_collect_success = self.collect_au_log(
2123                            kwargs['host_name'], pid, au_log_dir)
2124                else:
2125                    is_collect_success = True
2126
2127                # Error happens in _clean_track_log won't be raised. Auto-update
2128                # process will be retried.
2129                # TODO(xixuan): Change kwargs['host_name'] back to host_name
2130                # if crbug.com/651974 is fixed: host_name represents the host
2131                # name of the host, and kwargs['host_name'] could be host_name
2132                # or the IP of this host.
2133                is_clean_success = self.clean_track_log(kwargs['host_name'],
2134                                                        pid)
2135                # If any error is raised previously, log it and retry
2136                # auto-update. Otherwise, claim a successful CrOS auto-update.
2137                if not raised_error and is_clean_success and is_collect_success:
2138                    logging.debug('CrOS auto-update succeed for host %s',
2139                                  host_name)
2140                    is_au_success = True
2141                    break
2142                else:
2143                    if not self.kill_au_process_for_host(kwargs['host_name'],
2144                                                         pid):
2145                        logging.debug('Failed to kill auto_update process %d',
2146                                      pid)
2147                    if raised_error:
2148                        logging.debug(error_msg_attempt, au_attempt+1,
2149                                      str(raised_error))
2150                        if au_log_dir:
2151                            logging.debug('Please see error details in log %s',
2152                                          self._get_au_log_filename(
2153                                                  au_log_dir,
2154                                                  kwargs['host_name'],
2155                                                  pid))
2156                        error_list.append(self._parse_AU_error(str(raised_error)))
2157                        if self._is_retryable(str(raised_error)):
2158                            retry_with_another_devserver = True
2159
2160                        if self._should_use_original_payload(str(raised_error)):
2161                            force_original = True
2162
2163            finally:
2164                if retry_with_another_devserver:
2165                    break
2166
2167                if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1:
2168                    time.sleep(CROS_AU_RETRY_INTERVAL)
2169                    # TODO(kevcheng): Remove this once crbug.com/651974 is
2170                    # fixed.
2171                    # DNS is broken in the cassandra lab, so use the IP of the
2172                    # hostname instead if it fails. Not rename host_name here
2173                    # for error msg reporting.
2174                    host_name_ip = socket.gethostbyname(host_name)
2175                    kwargs['host_name'] = host_name_ip
2176                    logging.debug(
2177                            'AU failed, trying IP instead of hostname: %s',
2178                            host_name_ip)
2179
2180        # Note: To avoid reaching or exceeding the monarch field cardinality
2181        # limit, we avoid a metric that includes both dut hostname and other
2182        # high cardinality fields.
2183        # Per-devserver cros_update metric.
2184        c = metrics.Counter(
2185                'chromeos/autotest/provision/cros_update_by_devserver')
2186        # Add a field |error| here. Current error's pattern is manually
2187        # specified in _EXCEPTION_PATTERNS.
2188        raised_error = self._classify_exceptions(error_list)
2189        f = {'dev_server': self.resolved_hostname,
2190             'success': is_au_success,
2191             'board': board,
2192             'build_type': build_type,
2193             'milestone': milestone,
2194             'error': raised_error}
2195        c.increment(fields=f)
2196
2197        # Per-DUT cros_update metric.
2198        c = metrics.Counter('chromeos/autotest/provision/cros_update_per_dut')
2199        f = {'success': is_au_success,
2200             'board': board,
2201             'error': raised_error,
2202             'dut_host_name': host_name}
2203        c.increment(fields=f)
2204
2205        if is_au_success:
2206            return (is_au_success, pid)
2207
2208        # If errors happen in the CrOS AU process, report the first error
2209        # since the following errors might be caused by the first error.
2210        # If error happens in RPCs of cleaning track log, collecting
2211        # auto-update logs, or killing auto-update processes, just report
2212        # them together.
2213        if error_list:
2214            if retry_with_another_devserver:
2215                raise RetryableProvisionException(
2216                        error_msg % (host_name, error_list[0]))
2217            else:
2218                raise DevServerException(
2219                        error_msg % (host_name, error_list[0]))
2220        else:
2221            raise DevServerException(error_msg % (
2222                        host_name, ('RPC calls after the whole auto-update '
2223                                    'process failed.')))
2224
2225
class AndroidBuildServer(ImageServerBase):
    """Class for DevServer that handles RPCs related to Android builds.

    The calls to devserver to stage artifacts, including stage and download,
    are made in async mode. That is, when the caller makes an RPC |stage| to
    request devserver to stage certain artifacts, devserver handles the call
    and starts staging artifacts in a new thread, and returns |Success|
    without waiting for staging to complete. When the caller receives the
    message |Success|, it polls devserver's is_staged call until all
    artifacts are staged.
    This mechanism is designed to keep devserver from running out of cherrypy
    threads, as staging artifacts might take a long time and cherrypy starts
    with a fixed number of threads that handle devserver rpc.
    """

    def wait_for_artifacts_staged(self, target, build_id, branch,
                                  archive_url=None, artifacts='', files=''):
        """Polling devserver.is_staged until all artifacts are staged.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the build. Only forwarded
                            to the devserver when it is non-empty.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.

        @return: True if all artifacts are staged in devserver.
        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._poll_is_staged(**kwargs)


    @remote_devserver_call()
    def call_and_wait(self, call_name, target, build_id, branch, archive_url,
                      artifacts, files, error_message,
                      expected_response=SUCCESS):
        """Helper method to make a urlopen call, and wait for artifacts staged.

        @param call_name: name of devserver rpc call.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the build. Only forwarded
                            to the devserver when it is non-empty.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.
        @param error_message: Error message to be thrown if response does not
                              match expected_response.
        @param expected_response: Expected response from rpc, default to
                                  |Success|. If it's set to None, do not compare
                                  the actual response. Any response is
                                  considered to be good.

        @return: The response from rpc.
        @raise DevServerException upon any return code that's not
               expected_response.

        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._call_and_wait(call_name, error_message, expected_response,
                                   **kwargs)


    @remote_devserver_call()
    def stage_artifacts(self, target=None, build_id=None, branch=None,
                        image=None, artifacts=None, files='', archive_url=None):
        """Tell the devserver to download and stage |artifacts| from |image|.

        This is the main call point for staging any specific artifacts for a
        given build. To see the list of artifacts one can stage see:

        ~src/platform/dev/artifact_info.py.

        This is maintained along with the actual devserver code.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param image: Name of a build to test, in the format of
                      branch/target/build_id. Only parsed when none of
                      target, build_id and branch is given.
        @param artifacts: A list of artifacts.
        @param files: A list of files to stage.
        @param archive_url: Optional parameter that has the archive_url to stage
                this artifact from. Default is specified in autotest config +
                image.

        @raise DevServerException if the build info is incomplete, if neither
               artifacts nor files is given, or upon any return code that's
               not HTTP OK.
        """
        # Fall back to |image| only when no explicit build info was supplied.
        if image and not target and not build_id and not branch:
            branch, target, build_id = utils.parse_launch_control_build(image)
        if not target or not build_id or not branch:
            raise DevServerException('Must specify all build info (target, '
                                     'build_id and branch) to stage.')
        if not artifacts and not files:
            raise DevServerException('Must specify something to stage.')

        # All three values are known to be non-empty at this point, so no
        # further validation of the build info is needed.
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        self._stage_artifacts(build, artifacts, files, archive_url,
                              **android_build_info)


    def get_pull_url(self, target, build_id, branch):
        """Get the url to pull files from the devserver.

        @param target: Target of the android build, e.g., shamu_userdebug
        @param build_id: Build id of the android build.
        @param branch: Branch of the android build.

        @return A url to pull files from the dev server given a specific
                android build.
        """
        # NOTE(review): os.path.join is used to assemble a URL here; this
        # presumably relies on the code running on a POSIX host where the
        # path separator is '/'.
        return os.path.join(self.url(), 'static', branch, target, build_id)


    def trigger_download(self, target, build_id, branch, artifacts=None,
                         files='', os='android', synchronous=True):
        """Tell the devserver to download and stage an Android build.

        Tells the devserver to fetch an Android build from the image storage
        server named by _get_image_storage_server().

        If |synchronous| is True, waits for the entire download to finish
        staging before returning. Otherwise only the artifacts necessary
        to start installing images onto DUT's will be staged before returning.
        A caller can then call finish_download to guarantee the rest of the
        artifacts have finished staging.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param artifacts: A string of artifacts separated by comma. If None,
               use the default artifacts for Android or Brillo build.
        @param files: String of files separated by commas.
        @param os: OS artifacts to download (android/brillo). Note that this
               parameter shadows the `os` module inside this method.
        @param synchronous: if True, waits until all components of the image are
               staged before returning.

        @raise DevServerException upon any return code that's not HTTP OK.

        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        if not artifacts:
            # The board is the part of the target before the first '-',
            # e.g. 'shamu' for 'shamu-userdebug'.
            board = target.split('-')[0]
            artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._trigger_download(build, artifacts, files=files,
                               synchronous=synchronous, **android_build_info)


    def finish_download(self, target, build_id, branch, os='android'):
        """Tell the devserver to finish staging an Android build.

        If trigger_download is called with synchronous=False, it will return
        before all artifacts have been staged. This method contacts the
        devserver and blocks until all staging is completed and should be
        called after a call to trigger_download.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param os: OS artifacts to download (android/brillo). Note that this
               parameter shadows the `os` module inside this method.

        @raise DevServerException upon any return code that's not HTTP OK.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        board = target.split('-')[0]
        # Pass |os| through so that the artifacts waited for here are the same
        # default set that trigger_download stages for the given OS.
        artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._finish_download(build, artifacts, files='', **android_build_info)


    def get_staged_file_url(self, filename, target, build_id, branch):
        """Returns the url of a staged file for this image on the devserver.

        @param filename: Name of the file.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.

        @return: The url of a staged file for this image on the devserver.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch,
                              'os_type': 'android'}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        return '/'.join([self._get_image_url(build), filename])


    @remote_devserver_call()
    def translate(self, build_name):
        """Translate the build name if it's in LATEST format.

        If the build name is in the format [branch]/[target]/LATEST, return the
        latest build in Launch Control otherwise return the build name as is.

        @param build_name: build_name to check.

        @return The actual build name to use.
        """
        branch, target, build_id = utils.parse_launch_control_build(build_name)
        # The LATEST marker is matched case-insensitively.
        if build_id.upper() != 'LATEST':
            return build_name
        call = self.build_call('latestbuild', branch=branch, target=target,
                               os_type='android')
        translated_build_id = self.run_call(call)
        translated_build = (ANDROID_BUILD_NAME_PATTERN %
                            {'branch': branch,
                             'target': target,
                             'build_id': translated_build_id})
        logging.debug('Translated relative build %s to %s', build_name,
                      translated_build)
        return translated_build
2470
2471
def _is_load_healthy(load):
    """Tell whether a devserver's load is within the allowed thresholds.

    The CPU load is checked first, then the network IO. The first value found
    above its threshold is logged at debug level and ends the check.

    @param load: The devserver's load stats to check.

    @return: True if neither the CPU load nor the network IO exceeds its
             threshold. Return False otherwise.

    """
    cpu_load = load[DevServer.CPU_LOAD]
    cpu_limit = DevServer.MAX_CPU_LOAD
    if cpu_load > cpu_limit:
        logging.debug('CPU load of devserver %s is at %s%%, which is higher '
                      'than the threshold of %s%%', load['devserver'],
                      cpu_load, cpu_limit)
        return False

    # The network stat is only read once the CPU check has passed.
    network_io = load[DevServer.NETWORK_IO]
    network_limit = DevServer.MAX_NETWORK_IO
    if network_io > network_limit:
        logging.debug('Network IO of devserver %s is at %i Bps, which is '
                      'higher than the threshold of %i bytes per second.',
                      load['devserver'], network_io, network_limit)
        return False

    return True
2494
2495
def _compare_load(devserver1, devserver2):
    """Comparator function ordering two devservers by their disk IO.

    @param devserver1: A dictionary of devserver load stats to be compared.
    @param devserver2: A dictionary of devserver load stats to be compared.

    @return: The disk IO of `devserver1` minus the disk IO of `devserver2`,
             converted with int(). Negative if `devserver1` has the lower
             disk IO, positive if it has the higher one, and zero when the
             converted difference is zero.

    """
    disk_io_key = DevServer.DISK_IO
    disk_io_delta = devserver1[disk_io_key] - devserver2[disk_io_key]
    return int(disk_io_delta)
2507
2508
def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None):
    """Get the devserver with the least load.

    Iterate through all devservers and get the one with least load. Load stats
    are collected from every candidate in parallel; candidates whose stats are
    missing or fail the disk, apache client, CPU or network checks are dropped,
    and the remaining ones are ordered with `_compare_load`.

    TODO(crbug.com/486278): Devserver with required build already staged should
    take higher priority. This will need check_health call to be able to verify
    existence of a given build/artifact. Also, in case all devservers are
    overloaded, the logic here should fall back to the old behavior that randomly
    selects a devserver based on the hash of the image name/url.

    @param devserver_type: Type of devserver to select from. Default is set to
                           ImageServer.
    @param hostname: Hostname of the dut that the devserver is used for. The
            picked devserver needs to respect the location of the host if
            `prefer_local_devserver` is set to True or `restricted_subnets` is
            set.

    @return: Name of the devserver with the least load, or None if no healthy
             devserver is available for the host and retrying is not allowed,
             if no usable load stats could be retrieved, or if every devserver
             is above the load thresholds.

    """
    logging.debug('Get the least loaded %r', devserver_type)
    devservers, can_retry = devserver_type.get_available_devservers(
            hostname)
    # If no healthy devservers available and can_retry is False, return None.
    # Otherwise, relax the constraint on hostname, allow all devservers to be
    # available.
    if not devserver_type.get_healthy_devserver('', devservers):
        if not can_retry:
            return None
        else:
            devservers, _ = devserver_type.get_available_devservers()

    # get_devserver_load call needs to be made in a new process to allow force
    # timeout using signal.
    output = multiprocessing.Queue()
    processes = []
    for devserver in devservers:
        processes.append(multiprocessing.Process(
                target=devserver_type.get_devserver_load_wrapper,
                args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output)))

    for p in processes:
        p.start()
    # Drain the queue before joining: a child that has put data on a
    # multiprocessing.Queue may not exit until that data has been consumed,
    # so joining first risks a deadlock.
    loads = [output.get() for _ in processes]
    for p in processes:
        p.join()
    # Filter out any load failed to be retrieved or does not support load check.
    loads = [load for load in loads if load and DevServer.CPU_LOAD in load and
             DevServer.is_free_disk_ok(load) and
             DevServer.is_apache_client_count_ok(load)]
    if not loads:
        logging.debug('Failed to retrieve load stats from any devserver. No '
                      'load balancing can be applied.')
        return None
    loads = [load for load in loads if _is_load_healthy(load)]
    if not loads:
        logging.error('No devserver has the capacity to be selected.')
        return None
    loads = sorted(loads, cmp=_compare_load)
    return loads[0]['devserver']
2570
2571
def resolve(build, hostname=None, ban_list=None):
    """Pick a devserver that can be used for the given build and hostname.

    Launch Control builds are resolved through AndroidBuildServer; every other
    build is resolved through ImageServer.

    @param build: Name of a build to stage on devserver, e.g.,
                  ChromeOS build: daisy-release/R50-1234.0.0
                  Launch Control build: git_mnc_release/shamu-eng
    @param hostname: Hostname of the host the devserver is used for. Default is
            None, which means devserver is not restricted by the network
            location of the host.
    @param ban_list: The list of devservers that shouldn't be chosen. It is
            only passed on for builds that are not Launch Control builds.

    @return: A DevServer instance that can be used to stage given build for the
             given host.
    """
    if not utils.is_launch_control_build(build):
        return ImageServer.resolve(build, hostname, ban_list=ban_list)
    return AndroidBuildServer.resolve(build, hostname)
2589