• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
and L.Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provide a good indicator of parsing behavior.
28"""
29
30import re
31import sys
32import collections
33import warnings
34
35__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
36           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
37           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
38           "unquote", "unquote_plus", "unquote_to_bytes",
39           "DefragResult", "ParseResult", "SplitResult",
40           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
41
# Scheme classification tables.
# The empty string stands for "no scheme given"; it is the default scheme
# value handed back by urlsplit() and urlparse().

# urljoin() only resolves a reference against its base for these schemes.
uses_relative = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', 'sftp', 'svn', 'svn+ssh',
    'ws', 'wss',
]

# urlunsplit() emits a '//' authority marker for these schemes, and
# urljoin() lets a reference inherit the base netloc for them.
uses_netloc = [
    '', 'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
    'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh', 'ws', 'wss',
]

# urlparse() splits ';params' off the path only for these schemes.
uses_params = [
    '', 'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', 'sftp', 'tel',
]

# The next three tables are no longer consulted, but they are kept for
# backwards compatibility: they are undocumented yet have public-looking
# names.

non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]

uses_query = [
    '', 'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips',
]

uses_fragment = [
    '', 'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero',
]

# Every character that may appear in a scheme name.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789' '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() wipes the cache once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
83
def clear_cache():
    """Empty both module-level caches: parsed URLs and safe-set quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
88
89
90# Helpers for bytes handling
91# For 3.2, we deliberately require applications that
92# handle improperly quoted URLs to do their own
93# decoding and encoding. If valid use cases are
94# presented, we may relax this by using latin-1
95# decoding internally for 3.3
96_implicit_encoding = 'ascii'
97_implicit_errors = 'strict'
98
99def _noop(obj):
100    return obj
101
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Turn a str-based result back into bytes by calling its encode()."""
    encoded = obj.encode(encoding, errors)
    return encoded
105
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every truthy item of *args*; falsy items become ''.

    Returns the decoded values as a tuple in the original order.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
109
110def _coerce_args(*args):
111    # Invokes decode if necessary to create str args
112    # and returns the coerced inputs along with
113    # an appropriate result coercion function
114    #   - noop for str inputs
115    #   - encoding function otherwise
116    str_input = isinstance(args[0], str)
117    for arg in args[1:]:
118        # We special-case the empty string to support the
119        # "scheme=''" default argument to some functions
120        if arg and isinstance(arg, str) != str_input:
121            raise TypeError("Cannot mix str and non-str arguments")
122    if str_input:
123        return args + (_noop,)
124    return _decode_args(args) + (_encode_result,)
125
126# Result objects are more helpful than simple tuples
127class _ResultMixinStr(object):
128    """Standard approach to encoding parsed results from str to bytes"""
129    __slots__ = ()
130
131    def encode(self, encoding='ascii', errors='strict'):
132        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
133
134
135class _ResultMixinBytes(object):
136    """Standard approach to decoding parsed results from bytes to str"""
137    __slots__ = ()
138
139    def decode(self, encoding='ascii', errors='strict'):
140        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
141
142
143class _NetlocResultMixinBase(object):
144    """Shared methods for the parsed result objects containing a netloc element"""
145    __slots__ = ()
146
147    @property
148    def username(self):
149        return self._userinfo[0]
150
151    @property
152    def password(self):
153        return self._userinfo[1]
154
155    @property
156    def hostname(self):
157        hostname = self._hostinfo[0]
158        if not hostname:
159            return None
160        # Scoped IPv6 address may have zone info, which must not be lowercased
161        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
162        separator = '%' if isinstance(hostname, str) else b'%'
163        hostname, percent, zone = hostname.partition(separator)
164        return hostname.lower() + percent + zone
165
166    @property
167    def port(self):
168        port = self._hostinfo[1]
169        if port is not None:
170            try:
171                port = int(port, 10)
172            except ValueError:
173                message = f'Port could not be cast to integer value as {port!r}'
174                raise ValueError(message) from None
175            if not ( 0 <= port <= 65535):
176                raise ValueError("Port out of range 0-65535")
177        return port
178
179
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    # str implementation of the netloc splitting used by the base mixin.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) from the text before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) from the text after the last '@'.

        A bracketed host has its brackets removed; port is None when empty.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = hostinfo.partition('[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        return hostname, (port or None)
208
209
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    # bytes implementation of the netloc splitting used by the base mixin.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) from the bytes before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = credentials.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) from the bytes after the last b'@'.

        A bracketed host has its brackets removed; port is None when empty.
        """
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = hostinfo.partition(b'[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        return hostname, (port or None)
238
239
240from collections import namedtuple
241
242_DefragResultBase = namedtuple('DefragResult', 'url fragment')
243_SplitResultBase = namedtuple(
244    'SplitResult', 'scheme netloc path query fragment')
245_ParseResultBase = namedtuple(
246    'ParseResult', 'scheme netloc path params query fragment')
247
248_DefragResultBase.__doc__ = """
249DefragResult(url, fragment)
250
251A 2-tuple that contains the url without fragment identifier and the fragment
252identifier as a separate argument.
253"""
254
255_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
256
257_DefragResultBase.fragment.__doc__ = """
258Fragment identifier separated from URL, that allows indirect identification of a
259secondary resource by reference to a primary resource and additional identifying
260information.
261"""
262
263_SplitResultBase.__doc__ = """
264SplitResult(scheme, netloc, path, query, fragment)
265
266A 5-tuple that contains the different components of a URL. Similar to
267ParseResult, but does not split params.
268"""
269
270_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
271
272_SplitResultBase.netloc.__doc__ = """
273Network location where the request is made to.
274"""
275
276_SplitResultBase.path.__doc__ = """
277The hierarchical path, such as the path to a file to download.
278"""
279
280_SplitResultBase.query.__doc__ = """
281The query component, that contains non-hierarchical data, that along with data
282in path component, identifies a resource in the scope of URI's scheme and
283network location.
284"""
285
286_SplitResultBase.fragment.__doc__ = """
287Fragment identifier, that allows indirect identification of a secondary resource
288by reference to a primary resource and additional identifying information.
289"""
290
291_ParseResultBase.__doc__ = """
292ParseResult(scheme, netloc, path, params, query, fragment)
293
294A 6-tuple that contains components of a parsed URL.
295"""
296
297_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
298_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
299_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
300_ParseResultBase.params.__doc__ = """
301Parameters for last path element used to dereference the URI in order to provide
302access to perform some operation on the resource.
303"""
304
305_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
306_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
307
308
# Backwards-compatibility alias for _NetlocResultMixinStr.  ResultBase has
# dropped out of the documented API, but it stays because deprecating it
# isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
313
314# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    # str flavour of the (url, fragment) result.
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when there is one."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
322
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    # str flavour of the 5-field split result.
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
327
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    # str flavour of the 6-field parse result.
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
332
333# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    # bytes flavour of the (url, fragment) result.
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when there is one."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
341
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    # bytes flavour of the 5-field split result.
    __slots__ = ()

    def geturl(self):
        """Reassemble the five components into a URL via urlunsplit()."""
        rebuilt = urlunsplit(self)
        return rebuilt
346
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    # bytes flavour of the 6-field parse result.
    __slots__ = ()

    def geturl(self):
        """Reassemble the six components into a URL via urlunparse()."""
        rebuilt = urlunparse(self)
        return rebuilt
351
352# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link every str result class with its bytes twin, in both directions.

    encode() on the str mixin and decode() on the bytes mixin look up the
    class attributes assigned here.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
365
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into six components and return them as a 6-tuple.

    General shape: <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Result order: (scheme, netloc, path, params, query, fragment).

    Components are not broken down any further (netloc stays one string)
    and % escapes are left unexpanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ';params' split off the path.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
381
382def _splitparams(url):
383    if '/'  in url:
384        i = url.find(';', url.rfind('/'))
385        if i < 0:
386            return url, ''
387    else:
388        i = url.find(';')
389    return url[:i], url[i+1:]
390
391def _splitnetloc(url, start=0):
392    delim = len(url)   # position of end of domain part of url, default is end
393    for c in '/?#':    # look for delimiters; the order is NOT important
394        wdelim = url.find(c, start)        # find first of this delim
395        if wdelim >= 0:                    # if found
396            delim = min(delim, wdelim)     # use earliest delim position
397    return url[start:delim], url[delim:]   # return (domain, rest)
398
399def _checknetloc(netloc):
400    if not netloc or netloc.isascii():
401        return
402    # looking for characters like \u2100 that expand to 'a/c'
403    # IDNA uses NFKC equivalence, so normalize for this check
404    import unicodedata
405    n = netloc.replace('@', '')   # ignore characters already included
406    n = n.replace(':', '')        # but not the surrounding text
407    n = n.replace('#', '')
408    n = n.replace('?', '')
409    netloc2 = unicodedata.normalize('NFKC', n)
410    if n == netloc2:
411        return
412    for c in '/?#@:':
413        if c in netloc2:
414            raise ValueError("netloc '" + netloc + "' contains invalid " +
415                             "characters under NFKC normalization")
416
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into five components and return them as a 5-tuple.

    General shape: <scheme>://<netloc>/<path>?<query>#<fragment>
    Result order: (scheme, netloc, path, query, fragment).

    Components are not broken down any further (netloc stays one string)
    and % escapes are left unexpanded.  Results are memoised in the
    module-level parse cache.

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that
    fails the NFKC check in _checknetloc().
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return _coerce_result(hit)
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: start over once the cache is full.
        clear_cache()

    colon = url.find(':')
    if colon > 0:
        candidate = url[:colon]
        remainder = url[colon + 1:]
        if candidate == 'http':
            # Common case: accepted as a scheme without further checks.
            scheme, url = 'http', remainder
        elif all(ch in scheme_chars for ch in candidate):
            # An all-digit remainder means the text after the colon is
            # really a port number, so the "scheme" is part of the path.
            if not remainder or not all(ch in '0123456789'
                                        for ch in remainder):
                scheme, url = candidate.lower(), remainder

    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Brackets must come in pairs.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    parts = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = parts
    return _coerce_result(parts)
473
def urlunparse(components):
    """Rebuild a URL from the six components produced by urlparse().

    The outcome may differ slightly from the URL originally parsed while
    staying equivalent, e.g. when that URL carried redundant delimiters
    such as a '?' followed by an empty query.
    """
    coerced = _coerce_args(*components)
    scheme, netloc, url, params, query, fragment, _coerce_result = coerced
    if params:
        url = "%s;%s" % (url, params)
    rebuilt = urlunsplit((scheme, netloc, url, query, fragment))
    return _coerce_result(rebuilt)
484
def urlunsplit(components):
    """Rebuild a URL from the five components produced by urlsplit().

    *components* may be any five-item iterable.  The outcome may differ
    slightly from the URL originally parsed while staying equivalent, e.g.
    when that URL carried unnecessary delimiters such as a '?' followed by
    an empty query.
    """
    coerced = _coerce_args(*components)
    scheme, netloc, url, query, fragment, _coerce_result = coerced
    # An authority part is written when there is a netloc, or when the
    # scheme is a netloc-using one and the path does not already start
    # with '//'.
    needs_authority = bool(netloc) or (
        bool(scheme) and scheme in uses_netloc and url[:2] != '//')
    if needs_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
503
def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against the absolute *base*.

    An empty *base* returns *url* as is, and an empty *url* returns *base*.
    *url* is also returned unchanged when its scheme differs from the
    base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own authority: nothing to inherit.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # No path of its own: take the base path and params, and the base
        # query too unless the reference supplies one.
        path, params = bpath, bparams
        query = query or bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1]:
        # The last item is not a directory, so it takes no part in
        # resolving the relative path.
        base_parts.pop()

    # For rfc3986, the base path is ignored entirely when the reference
    # path starts at the root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # Drop empty interior items that would otherwise produce redundant
        # slashes when the resolved path is joined again.
        segments[1:-1] = [seg for seg in segments[1:-1] if seg]

    resolved_path = []
    for seg in segments:
        if seg == '.':
            continue
        if seg != '..':
            resolved_path.append(seg)
        elif resolved_path:
            # A '..' with nothing left to remove is ignored (rfc3986).
            resolved_path.pop()

    if segments[-1] in ('.', '..'):
        # The last segment was a relative directory, so the result needs
        # a trailing '/'.
        resolved_path.append('')

    joined = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined,
                                      params, query, fragment)))
571
572
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a DefragResult holding the URL without its fragment and the
    fragment itself; the fragment is the empty string when *url* has none.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(defrag, frag))
588
_hexdig = '0123456789ABCDEFabcdef'
# Maps two-hex-digit bytes (e.g. b'2F') to the byte they denote; built on
# first use by unquote_to_bytes().
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: str input is encoded as UTF-8. That only matters when it holds
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # The table is created lazily so no memory is spent on it when this
    # function is never called.
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid escape: keep the '%' and the text literally.
            pieces += (b'%', chunk)
        else:
            pieces += (byte, chunk[2:])
    return b''.join(pieces)
621
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    *string* may be a str or a bytes object; the result is always a str.
    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  A value of None selects the default for either.
    By default, percent-encoded sequences are decoded with UTF-8, and
    invalid sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        # bytes input cannot take the str-based path below; unquote it at
        # the byte level and decode the whole result.
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Touch .split so that non-string input fails here.
        string.split
        return string
    # Splitting on a capturing group leaves the ASCII runs at the odd
    # indices; only those can hold %xx escapes.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
648
649
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a dictionary of value lists.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: when true, blank values in the query are kept
            as empty strings.  When false (the default), fields with a
            blank value are dropped as if they had not been included.

        strict_parsing: when true, a parsing error raises ValueError.
            When false (the default), errors are silently ignored.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        max_num_fields: int. If set, a ValueError is raised when
            parse_qsl() would read more than that many fields.

        Returns a dictionary mapping each field name to the list of its
        values, in order of appearance.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors,
                                 max_num_fields=max_num_fields):
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
687
688
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: when true, blank values in the query are kept
            as empty strings.  When false (the default), fields with a
            blank value are dropped as if they had not been included.

        strict_parsing: when true, a parsing error raises ValueError.
            When false (the default), errors are silently ignored.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        max_num_fields: int. If set, a ValueError is raised when the
            query holds more than that many fields.

        Returns a list of (name, value) pairs in order of appearance.
    """
    qs, _coerce_result = _coerce_args(qs)

    # With max_num_fields set, count the fields up front and refuse
    # oversized input before anything is split.  This guards against a
    # memory exhaustion DOS attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count('&') + qs.count(';')
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    def _decode(text):
        # '+' stands for a space; everything else is %-unquoted.
        plain = unquote(text.replace('+', ' '),
                        encoding=encoding, errors=errors)
        return _coerce_result(plain)

    fields = []
    # NOTE: both '&' and ';' act as field separators.
    for chunk in qs.split('&'):
        for name_value in chunk.split(';'):
            if not name_value and not strict_parsing:
                continue
            raw_name, equals, raw_value = name_value.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # A control-name with no equal sign only survives when
                # blank values are kept.
                if not keep_blank_values:
                    continue
            if raw_value or keep_blank_values:
                name = _decode(raw_name)
                value = _decode(raw_value)
                fields.append((name, value))
    return fields
748
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are turned into spaces first, as
    required for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    spaced = string.replace('+', ' ')
    return unquote(spaced, encoding, errors)
757
# Byte values that are never percent-encoded.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'abcdefghijklmnopqrstuvwxyz'
    b'0123456789' b'_.-~'
)
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The defaultdict base doubles as a cache: once a key is stored,
    # looking it up no longer calls any Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this override the object would print as a defaultdict.
        cached = dict(self)
        return "<%s %r>" % (self.__class__.__name__, cached)

    def __missing__(self, b):
        # Cache miss: work out the quoted form, remember it, return it.
        if b in self.safe:
            res = chr(b)
        else:
            res = '%{:02X}'.format(b)
        self[b] = res
        return res
786
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Every part of a URL (path, query, ...) has its own set of reserved
    characters that must be quoted.  This function offers a cautious (not
    minimal) way to quote a string for most of those parts.

    RFC 3986 (Uniform Resource Identifier: Generic Syntax) defines:

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each reserved character is reserved in some URL component, but not
    necessarily in all of them.

    Characters are %-escaped unless they are unreserved ("always safe") or
    listed in the safe argument.  safe defaults to '/': that character is
    reserved, but quote() is typically applied to a path whose existing
    slashes should be preserved.

    Since Python 3.7 the quoting follows RFC 3986 instead of RFC 2396, so
    "~" counts as unreserved.

    string and safe may each be str or bytes.  encoding and errors say how
    a str is turned into bytes, as accepted by str.encode(); they default
    to 'utf-8' and 'strict' (unsupported characters raise a
    UnicodeEncodeError).  Passing either of them together with a non-str
    string raises TypeError.
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    elif not string:
        # Nothing to quote in an empty str.
        return string
    else:
        string = string.encode('utf-8' if encoding is None else encoding,
                               'strict' if errors is None else errors)
    return quote_from_bytes(string, safe)
840
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.

    Plus signs already present in string are escaped unless '+' is part
    of safe. Unlike quote(), safe does not default to '/'.
    """
    # string may be str or bytes, so look for a space of the matching type.
    # Without any space the plain quote() result is already correct.
    if isinstance(string, str):
        spaceless = ' ' not in string
    elif isinstance(string, bytes):
        spaceless = b' ' not in string
    else:
        spaceless = False
    if spaceless:
        return quote(string, safe, encoding, errors)

    # Let spaces through quote() untouched (the extra safe character must
    # have the same type as safe), then turn them into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
857
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    bs must be a bytes or bytearray object, otherwise TypeError is raised.
    safe may be a str or a bytes-like iterable of ints; non-ASCII entries
    in it are dropped.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object containing only ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: if stripping safe bytes leaves nothing, nothing needs
    # quoting and the input can be decoded directly.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # One quoting callable is kept per distinct 'safe' value.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
879
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Components that are neither are converted with str() before quoting.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Returns the 'key=value' pairs joined with '&'.

    Raises TypeError if query has no items() method and is not a sequence
    whose first element is a tuple (for example a non-empty string, or an
    object that supports neither len() nor indexing).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Chain explicitly so the underlying failure stays visible.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def quote_component(component):
        # bytes are quoted as-is; anything else is stringified first and
        # quoted with the caller's encoding/errors settings.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    pairs = []
    for key, value in query:
        key = quote_component(key)
        if not doseq or isinstance(value, bytes):
            pairs.append(key + '=' + quote_component(value))
        elif isinstance(value, str):
            # A str is a sequence too, but must stay a single parameter.
            pairs.append(key + '=' + quote_via(value, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                pairs.append(key + '=' + quote_component(value))
            else:
                # one 'key=element' parameter per sequence element
                for element in value:
                    pairs.append(key + '=' + quote_component(element))
    return '&'.join(pairs)
959
960
def to_bytes(url):
    """Deprecated public wrapper around _to_bytes().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _to_bytes(url).
    """
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
965
966
def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is round-tripped through ASCII and returned as a str;
    UnicodeError is raised if it contains non-ASCII characters.  Any
    other object is returned unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
979
980
def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    url is converted with str() and stripped of surrounding whitespace.
    Enclosing '<' ... '>' and a leading 'URL:' are each removed when
    present (stripping whitespace again after each removal); otherwise
    the stripped string is returned unchanged.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
992
993
def splittype(url):
    """Deprecated public wrapper around _splittype().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splittype(url).
    """
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittype(url)
999
1000
# Compiled on first use by _splittype() and then reused.
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned lower-cased.  When url does not start with a
    'type:' prefix (one or more characters other than '/' and ':' followed
    by ':'), (None, url) is returned.
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
1013
1014
def splithost(url):
    """Deprecated public wrapper around _splithost().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splithost(url).
    """
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splithost(url)
1020
1021
# Compiled on first use by _splithost() and then reused.
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part runs from the leading '//' up to the first '/', '#' or
    '?'.  A non-empty remainder that does not begin with '/' gets one
    prepended.  When url does not start with '//', (None, url) is returned.
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return host_port, rest
1036
1037
def splituser(host):
    """Deprecated public wrapper around _splituser().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splituser(host).
    """
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splituser(host)
1043
1044
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part is
    None and the whole input is returned as the host part.
    """
    head, at_sign, tail = host.rpartition('@')
    if not at_sign:
        return None, tail
    return head, tail
1049
1050
def splitpasswd(user):
    """Deprecated public wrapper around _splitpasswd().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splitpasswd(user).
    """
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1056
1057
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
1062
1063
def splitport(host):
    """Deprecated public wrapper around _splitport().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splitport(host).
    """
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitport(host)
1069
1070
# Pattern for 'host:port' with an all-digit (possibly empty) port;
# compiled on first use by _splitport().
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  If host does not end in
    ':' followed by digits only, the port is None; a trailing ':' with
    nothing after it is dropped from the returned host.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = _portprog.fullmatch(host)
    if found is None:
        return host, None
    name, port = found.groups()
    # An empty port ('host:') is reported as None.
    return name, (port or None)
1085
1086
def splitnport(host, defport=-1):
    """Deprecated public wrapper around _splitnport().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splitnport(host, defport).
    """
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1092
1093
def _splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    The split happens at the last ':' in host.
    Return given default port if no ':' found, or if nothing follows the
    ':'; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.  Only ASCII decimal digits
    count as a valid number; signs, whitespace, underscores and non-ASCII
    digits (all of which int() alone would accept) are rejected.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() put the whole input in `port`.
        host = port
    elif port:
        nport = None
        if port.isascii() and port.isdigit():
            try:
                nport = int(port)
            except ValueError:
                # int() may still refuse the string (for instance when it
                # is extremely long); report that as an invalid port.
                nport = None
        return host, nport
    return host, defport
1109
1110
def splitquery(url):
    """Deprecated public wrapper around _splitquery().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splitquery(url).
    """
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1116
1117
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the query part
    is None and url is returned whole.
    """
    head, question_mark, tail = url.rpartition('?')
    if not question_mark:
        return url, None
    return head, tail
1124
1125
def splittag(url):
    """Deprecated public wrapper around _splittag().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splittag(url).
    """
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittag(url)
1131
1132
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the tag part is
    None and url is returned whole.
    """
    head, hash_mark, tail = url.rpartition('#')
    if not hash_mark:
        return url, None
    return head, tail
1139
1140
def splitattr(url):
    """Deprecated public wrapper around _splitattr().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splitattr(url).
    """
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1146
1147
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    Splits url on every ';'.  The attribute list is empty when url
    contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1153
1154
def splitvalue(attr):
    """Deprecated public wrapper around _splitvalue().

    Emits a DeprecationWarning (stacklevel=2, so it points at the caller)
    and returns _splitvalue(attr).
    """
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1160
1161
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value
1166