• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
and L. Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28"""
29
30import re
31import sys
32import types
33import collections
34import warnings
35
# Names exported by ``from <this module> import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a relative reference against the
# base; for any other scheme urljoin() returns the url argument as given.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes that take a '//netloc' part: urlunsplit() emits '//' for them even
# when netloc is empty, and urljoin() inherits the base netloc for them.
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names; urlsplit() only accepts a scheme whose
# characters all come from this set.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Unsafe bytes to be removed per WHATWG spec; urlsplit() deletes every
# occurrence of these characters from the URL it parses.
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache and empties the caches
# (via clear_cache()) once MAX_CACHE_SIZE entries have accumulated.
MAX_CACHE_SIZE = 20
_parse_cache = {}
87
def clear_cache():
    """Empty both module-level caches: parsed URLs and safe quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
92
93
94# Helpers for bytes handling
95# For 3.2, we deliberately require applications that
96# handle improperly quoted URLs to do their own
97# decoding and encoding. If valid use cases are
98# presented, we may relax this by using latin-1
99# decoding internally for 3.3
100_implicit_encoding = 'ascii'
101_implicit_errors = 'strict'
102
103def _noop(obj):
104    return obj
105
106def _encode_result(obj, encoding=_implicit_encoding,
107                        errors=_implicit_errors):
108    return obj.encode(encoding, errors)
109
110def _decode_args(args, encoding=_implicit_encoding,
111                       errors=_implicit_errors):
112    return tuple(x.decode(encoding, errors) if x else '' for x in args)
113
114def _coerce_args(*args):
115    # Invokes decode if necessary to create str args
116    # and returns the coerced inputs along with
117    # an appropriate result coercion function
118    #   - noop for str inputs
119    #   - encoding function otherwise
120    str_input = isinstance(args[0], str)
121    for arg in args[1:]:
122        # We special-case the empty string to support the
123        # "scheme=''" default argument to some functions
124        if arg and isinstance(arg, str) != str_input:
125            raise TypeError("Cannot mix str and non-str arguments")
126    if str_input:
127        return args + (_noop,)
128    return _decode_args(args) + (_encode_result,)
129
130# Result objects are more helpful than simple tuples
131class _ResultMixinStr(object):
132    """Standard approach to encoding parsed results from str to bytes"""
133    __slots__ = ()
134
135    def encode(self, encoding='ascii', errors='strict'):
136        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
137
138
139class _ResultMixinBytes(object):
140    """Standard approach to decoding parsed results from bytes to str"""
141    __slots__ = ()
142
143    def decode(self, encoding='ascii', errors='strict'):
144        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
145
146
147class _NetlocResultMixinBase(object):
148    """Shared methods for the parsed result objects containing a netloc element"""
149    __slots__ = ()
150
151    @property
152    def username(self):
153        return self._userinfo[0]
154
155    @property
156    def password(self):
157        return self._userinfo[1]
158
159    @property
160    def hostname(self):
161        hostname = self._hostinfo[0]
162        if not hostname:
163            return None
164        # Scoped IPv6 address may have zone info, which must not be lowercased
165        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
166        separator = '%' if isinstance(hostname, str) else b'%'
167        hostname, percent, zone = hostname.partition(separator)
168        return hostname.lower() + percent + zone
169
170    @property
171    def port(self):
172        port = self._hostinfo[1]
173        if port is not None:
174            try:
175                port = int(port, 10)
176            except ValueError:
177                message = f'Port could not be cast to integer value as {port!r}'
178                raise ValueError(message) from None
179            if not ( 0 <= port <= 65535):
180                raise ValueError("Port out of range 0-65535")
181        return port
182
183    __class_getitem__ = classmethod(types.GenericAlias)
184
185
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str flavour of the netloc helpers: splits self.netloc into its parts."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) from the text before the last '@'.

        Both are None when netloc has no '@'; password is None when the
        user info has no ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) from the text after the last '@'.

        A bracketed host (IPv6 literal) is returned without its brackets.
        port is the raw text after ':' or None when that text is empty.
        """
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_port.partition(':')
        return host, (port or None)
214
215
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes flavour of the netloc helpers: splits self.netloc into its parts."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) from the bytes before the last b'@'.

        Both are None when netloc has no b'@'; password is None when the
        user info has no b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) from the bytes after the last b'@'.

        A bracketed host (IPv6 literal) is returned without its brackets.
        port is the raw bytes after b':' or None when they are empty.
        """
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_port.partition(b':')
        return host, (port or None)
244
245
246from collections import namedtuple
247
# Plain namedtuple bases that hold the fields of each result type.  The
# public str and bytes result classes in this module derive from them.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# Replace the namedtuple class docstring and the per-field docstrings with
# descriptive text.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field docstrings; only 'params'
# exists solely on ParseResult and gets its own text.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
313
314
315# For backwards compatibility, alias _NetlocResultMixinStr
316# ResultBase is no longer part of the documented API, but it is
317# retained since deprecating it isn't worth the hassle
318ResultBase = _NetlocResultMixinStr
319
320# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment, if any, re-attached after '#'."""
        fragment = self.fragment
        return self.url + '#' + fragment if fragment else self.url
328
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result's five fields by urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
333
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result's six fields by urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
338
339# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment, if any, re-attached after b'#'."""
        fragment = self.fragment
        return self.url + b'#' + fragment if fragment else self.url
347
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result's five fields by urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
352
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL rebuilt from this result's six fields by urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
357
358# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link every str result class with its bytes twin.

    Sets _encoded_counterpart on the str class and _decoded_counterpart on
    the bytes class; the encode()/decode() mixin methods rely on them.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run the one-off setup, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
371
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    scheme://netloc/path;params?query#fragment

    Returns a named 6-tuple with one field per component above: a
    ParseResult for str input, a ParseResultBytes otherwise.  The
    username, password, hostname, and port sub-components of netloc are
    also available as attributes of the result.

    scheme is the default used for the scheme component when url does not
    contain one.

    When allow_fragments is false the fragment is not separated and stays
    attached to the preceding component (path or query).

    % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # params are only split off for schemes known to use them
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
401
402def _splitparams(url):
403    if '/'  in url:
404        i = url.find(';', url.rfind('/'))
405        if i < 0:
406            return url, ''
407    else:
408        i = url.find(';')
409    return url[:i], url[i+1:]
410
411def _splitnetloc(url, start=0):
412    delim = len(url)   # position of end of domain part of url, default is end
413    for c in '/?#':    # look for delimiters; the order is NOT important
414        wdelim = url.find(c, start)        # find first of this delim
415        if wdelim >= 0:                    # if found
416            delim = min(delim, wdelim)     # use earliest delim position
417    return url[start:delim], url[delim:]   # return (domain, rest)
418
419def _checknetloc(netloc):
420    if not netloc or netloc.isascii():
421        return
422    # looking for characters like \u2100 that expand to 'a/c'
423    # IDNA uses NFKC equivalence, so normalize for this check
424    import unicodedata
425    n = netloc.replace('@', '')   # ignore characters already included
426    n = n.replace(':', '')        # but not the surrounding text
427    n = n.replace('#', '')
428    n = n.replace('?', '')
429    netloc2 = unicodedata.normalize('NFKC', n)
430    if n == netloc2:
431        return
432    for c in '/?#@:':
433        if c in netloc2:
434            raise ValueError("netloc '" + netloc + "' contains invalid " +
435                             "characters under NFKC normalization")
436
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    scheme://netloc/path?query#fragment

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Tab, carriage return and line feed characters are removed from url
    and scheme before parsing.

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that
    gains URL delimiters under NFKC normalization.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)

    # Strip the unsafe characters before anything is parsed, so that they
    # cannot hide a scheme (e.g. 'ht\ntp://host/') or leak into a component.
    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
        # After _coerce_args a truthy scheme is always a str; falsy
        # defaults of other types (b'', None) are left untouched.
        if scheme:
            scheme = scheme.replace(b, "")

    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    # A scheme starts with an ASCII letter and consists of scheme_chars only.
    if i > 0 and url[0].isascii() and url[0].isalpha():
        if all(c in scheme_chars for c in url[:i]):
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
491
def urlunparse(components):
    """Reassemble a URL from the six components produced by urlparse().

    The result may differ slightly from the URL that was parsed, while
    remaining equivalent, if the original had redundant delimiters, e.g.
    a ? with an empty query.
    """
    scheme, netloc, path, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        # fold the params back into the path; urlunsplit() does the rest
        path = f"{path};{params}"
    joined = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(joined)
502
def urlunsplit(components):
    """Combine the five elements returned by urlsplit() into a complete URL.

    The components argument can be any five-item iterable.  The result may
    differ slightly from the URL that was parsed, while remaining
    equivalent, if the original had unnecessary delimiters (for example,
    a ? with an empty query).
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # '//' is written when there is a netloc, and also for schemes that
    # take one, unless the path already starts with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
521
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty base returns url unchanged and an empty url returns base
    unchanged.  url is also returned as-is when its scheme differs from
    the base scheme or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # url carries its own network location: nothing to inherit
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # url is only a query and/or fragment: keep the base path and params
        path, params = bpath, bparams
        query = query or bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_dirs = bpath.split('/')
    if base_dirs[-1]:
        # the last item is not a directory, so it takes no part in
        # resolving the relative path
        base_dirs.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path.startswith('/'):
        segments = path.split('/')
    else:
        segments = base_dirs + path.split('/')
        # drop empty inner segments that would produce redundant slashes
        # when the resolved path is joined again
        segments[1:-1] = [seg for seg in segments[1:-1] if seg]

    resolved = []
    for seg in segments:
        if seg == '.':
            continue
        if seg != '..':
            resolved.append(seg)
        elif resolved:
            # a '..' with nothing left to remove is ignored
            resolved.pop()

    if segments[-1] in ('.', '..'):
        # the last segment named a directory, so the result needs a
        # trailing '/'
        resolved.append('')

    joined_path = '/'.join(resolved) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
589
590
def urldefrag(url):
    """Remove any existing fragment from URL.

    Returns a DefragResult (or its bytes counterpart for non-str input)
    holding the defragmented URL and the fragment.  The fragment is the
    empty string when the URL contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    s, n, p, a, q, frag = urlparse(url)
    without_fragment = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(without_fragment, frag))
606
607_hexdig = '0123456789ABCDEFabcdef'
608_hextobyte = None
609
610def unquote_to_bytes(string):
611    """unquote_to_bytes('abc%20def') -> b'abc def'."""
612    # Note: strings are encoded as UTF-8. This is only an issue if it contains
613    # unescaped non-ASCII characters, which URIs should not.
614    if not string:
615        # Is it a string-like object?
616        string.split
617        return b''
618    if isinstance(string, str):
619        string = string.encode('utf-8')
620    bits = string.split(b'%')
621    if len(bits) == 1:
622        return string
623    res = [bits[0]]
624    append = res.append
625    # Delay the initialization of the table to not waste memory
626    # if the function is never called
627    global _hextobyte
628    if _hextobyte is None:
629        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
630                      for a in _hexdig for b in _hexdig}
631    for item in bits[1:]:
632        try:
633            append(_hextobyte[item[:2]])
634            append(item[2:])
635        except KeyError:
636            append(b'%')
637            append(item)
638    return b''.join(res)
639
640_asciire = re.compile('([\x00-\x7f]+)')
641
642def unquote(string, encoding='utf-8', errors='replace'):
643    """Replace %xx escapes by their single-character equivalent. The optional
644    encoding and errors parameters specify how to decode percent-encoded
645    sequences into Unicode characters, as accepted by the bytes.decode()
646    method.
647    By default, percent-encoded sequences are decoded with UTF-8, and invalid
648    sequences are replaced by a placeholder character.
649
650    unquote('abc%20def') -> 'abc def'.
651    """
652    if isinstance(string, bytes):
653        return unquote_to_bytes(string).decode(encoding, errors)
654    if '%' not in string:
655        string.split
656        return string
657    if encoding is None:
658        encoding = 'utf-8'
659    if errors is None:
660        errors = 'replace'
661    bits = _asciire.split(string)
662    res = [bits[0]]
663    append = res.append
664    for i in range(1, len(bits), 2):
665        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
666        append(bits[i + 1])
667    return ''.join(res)
668
669
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query string into a dictionary.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be kept as blank strings.
            The default false value drops fields whose value is blank,
            as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when
            parse_qsl() would read more than that many fields.

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a dictionary mapping each field name to the list of its
        values, in order of appearance.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors,
                                 max_num_fields=max_num_fields,
                                 separator=separator):
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
710
711
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be kept as blank strings.
            The default false value drops fields whose value is blank,
            as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when the
            query contains more than that many fields.

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a list of (name, value) tuples in order of appearance.
    """
    qs, _coerce_result = _coerce_args(qs)
    separator, _ = _coerce_args(separator)

    if not (separator and isinstance(separator, (str, bytes))):
        raise ValueError("Separator must be of type string or bytes.")

    # The field count is checked up front, before any splitting work, to
    # prevent a memory exhaustion DOS attack via post bodies with many
    # fields.
    if max_num_fields is not None:
        if 1 + qs.count(separator) > max_num_fields:
            raise ValueError('Max number of fields exceeded')

    pairs = []
    for field in qs.split(separator):
        if not (field or strict_parsing):
            continue
        name, equals, value = field.partition('=')
        if not equals:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control-name with no equal sign counts as a blank value
            # and is only kept when blank values are kept.
            if not keep_blank_values:
                continue
        if value or keep_blank_values:
            name = unquote(name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            value = unquote(value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            pairs.append((_coerce_result(name), _coerce_result(value)))
    return pairs
777
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are turned into spaces first, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
786
787_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
788                         b'abcdefghijklmnopqrstuvwxyz'
789                         b'0123456789'
790                         b'_.-~')
791_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
792_safe_quoters = {}
793
794class Quoter(collections.defaultdict):
795    """A mapping from bytes (in range(0,256)) to strings.
796
797    String values are percent-encoded byte values, unless the key < 128, and
798    in the "safe" set (either the specified safe set, or default set).
799    """
800    # Keeps a cache internally, using defaultdict, for efficiency (lookups
801    # of cached keys don't call Python code at all).
802    def __init__(self, safe):
803        """safe: bytes object."""
804        self.safe = _ALWAYS_SAFE.union(safe)
805
806    def __repr__(self):
807        # Without this, will just display as a defaultdict
808        return "<%s %r>" % (self.__class__.__name__, dict(self))
809
810    def __missing__(self, b):
811        # Handle a cache miss. Store quoted string in cache and return.
812        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
813        self[b] = res
814        return res
815
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL (the path info, the query, etc.) has a different
    set of reserved characters that must be quoted.  quote() offers a
    cautious (not minimal) way to quote a string for most of these parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Every reserved character is reserved in some component of a URL, but
    not necessarily in all of them.

    quote() %-escapes all characters that are neither unreserved ("always
    safe") nor listed in the safe argument.  safe defaults to '/': the
    character is reserved, but quote() is typically called on a path whose
    existing slashes are to be preserved.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects.  encoding and
    errors must not be specified if string is a bytes object; doing so
    raises TypeError.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        # an empty str is returned as-is, without encoding
        return string
    raw = string.encode('utf-8' if encoding is None else encoding,
                        'strict' if errors is None else errors)
    return quote_from_bytes(raw, safe)
869
def quote_plus(string, safe='', encoding=None, errors=None):
    """Quote like quote(), but encode each space as '+', which is what HTML
    form values require.  A plus sign already present in the input is
    escaped unless safe contains it.  Unlike quote(), safe defaults to the
    empty string rather than '/'.
    """
    # string may be str or bytes; the membership test has to use a space of
    # the matching type.  Input without spaces needs nothing beyond quote().
    if isinstance(string, str):
        spaceless = ' ' not in string
    elif isinstance(string, bytes):
        spaceless = b' ' not in string
    else:
        spaceless = False
    if spaceless:
        return quote(string, safe, encoding, errors)

    # Let spaces through quote() untouched by adding one to safe (typed to
    # match safe), then swap them for plus signs in the quoted result.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
886
def quote_from_bytes(bs, safe='/'):
    """Percent-encode a bytes or bytearray object; the counterpart of
    quote() that skips the str-to-bytes encoding step.  The result is
    always an ASCII str, with escapes written in uppercase hex.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    Raises TypeError when bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Reduce 'safe' to a bytes object holding ASCII values only.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Fast path: stripping safe bytes leaves nothing, so every byte is safe
    # and no escaping is required.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    # Reuse the per-'safe' lookup function when one is already cached.
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
908
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.  With doseq
    false, every value is quoted as a single item (non-bytes values via
    str()).

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query has no items() method and is either unsized
    or a non-empty sequence whose first element is not a tuple (a non-empty
    string, for example).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def quote_item(item):
        # bytes are quoted as they are; anything else is stringified first
        # and quoted using the caller's encoding and errors.
        if isinstance(item, bytes):
            return quote_via(item, safe)
        return quote_via(str(item), safe, encoding, errors)

    pairs = []
    for k, v in query:
        k = quote_item(k)
        if not doseq or isinstance(v, bytes):
            pairs.append(k + '=' + quote_item(v))
        elif isinstance(v, str):
            # A str is a sequence too, but it must stay a single value.
            pairs.append(k + '=' + quote_via(v, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                pairs.append(k + '=' + quote_item(v))
            else:
                # one key=value pair per element of the sequence
                for elt in v:
                    pairs.append(k + '=' + quote_item(elt))
    return '&'.join(pairs)
988
989
def to_bytes(url):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _to_bytes(url)."""
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _to_bytes(url)


def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        # Non-str input is passed through untouched.
        return url
    try:
        # The encode/decode round trip succeeds only for pure-ASCII text.
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
1008
1009
def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; one enclosing '<...>' pair and one leading 'URL:' prefix are
    removed when present.  A string that is not wrapped comes back with only
    the whitespace stripped.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
1021
1022
def splittype(url):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splittype(url)."""
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittype(url)


# Compiled on first use by _splittype().
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        # No "scheme:" prefix; the input is returned whole.
        return None, url
    # The scheme is lower-cased, the remainder is left as given.
    return found.group(1).lower(), found.group(2)
1042
1043
def splithost(url):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splithost(url)."""
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splithost(url)


# Compiled on first use by _splithost().
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        # The input does not start with '//'; it is returned whole.
        return None, url
    host_port, path = found.groups()
    # A non-empty remainder that starts at '?' or '#' gets a leading slash.
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
1065
1066
def splituser(host):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splituser(host)."""
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splituser(host)


def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    # Split at the last '@'; without one, the user part is reported as None.
    credentials, at_sign, hostport = host.rpartition('@')
    if not at_sign:
        credentials = None
    return credentials, hostport
1078
1079
def splitpasswd(user):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splitpasswd(user)."""
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)


def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    # Split at the first ':'; without one, the password is reported as None.
    name, colon, passwd = user.partition(':')
    if not colon:
        passwd = None
    return name, passwd
1091
1092
def splitport(host):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splitport(host)."""
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitport(host)


# Compiled on first use by _splitport().
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = _portprog.fullmatch(host)
    if found is None:
        # No trailing ":<digits>" part; the input is returned whole.
        return host, None
    name, port = found.groups()
    # A bare trailing colon ("host:") matches with an empty port: the colon
    # is dropped from the host and the port is reported as None.
    return name, (port or None)
1114
1115
def splitnport(host, defport=-1):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splitnport(host, defport)."""
    warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)


def _splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' is found or nothing follows the
    last ':'; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' is followed by something that is not a valid number."""
    host, delim, port = host.rpartition(':')
    if not delim:
        # No colon at all: rpartition() put the whole input in 'port'.
        host = port
    elif port:
        # Only a run of ASCII digits counts as a port.  int() alone would
        # also accept a sign, surrounding whitespace, '_' separators and
        # non-ASCII digits, none of which is a valid port number.
        if port.isdigit() and port.isascii():
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
1138
1139
def splitquery(url):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splitquery(url)."""
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitquery(url)


def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    # Split at the last '?'; without one, the query is reported as None.
    head, question_mark, query = url.rpartition('?')
    if not question_mark:
        return url, None
    return head, query
1153
1154
def splittag(url):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splittag(url)."""
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittag(url)


def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    # Split at the last '#'; without one, the tag is reported as None.
    head, hash_sign, tag = url.rpartition('#')
    if not hash_sign:
        return url, None
    return head, tag
1168
1169
def splitattr(url):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splitattr(url)."""
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitattr(url)


def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # The first ';'-separated piece is the path; the rest (possibly none)
    # form the attribute list.
    path, *attrs = url.split(';')
    return path, attrs
1182
1183
def splitvalue(attr):
    """Deprecated public entry point: emit a DeprecationWarning, then return
    _splitvalue(attr)."""
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)


def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    # Split at the first '='; without one, the value is reported as None.
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value
1195