• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28"""
29
30import re
31import sys
32import types
33import collections
34import warnings
35
# Names exported by "from <module> import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a relative reference against the
# base; for any other scheme urljoin() returns the url argument unchanged.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes consulted by urlunsplit() (to decide whether to emit "//netloc"
# even when netloc is empty) and by urljoin() (to inherit the base netloc).
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
# (urlsplit() only treats the text before the first ':' as a scheme when
# every character of it appears in this string.)
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() empties its caches once _parse_cache holds this many entries.
MAX_CACHE_SIZE = 20
# Results of urlsplit(), keyed by
# (url, scheme, allow_fragments, type(url), type(scheme)).
_parse_cache = {}
84
def clear_cache():
    """Empty the module-level caches (parsed URLs and safe quoters)."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
89
90
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Return *obj* untouched; the result coercion used for str input."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode *obj* with the implicit codec; used for non-str input."""
    encoded = obj.encode(encoding, errors)
    return encoded


def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every item of *args* to str; falsy items become ''."""
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)


def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    ``_noop`` when the first argument is a str, ``_encode_result``
    otherwise.  Raises TypeError when a later, non-empty argument does
    not agree with the first one about being a str.
    """
    first_is_str = isinstance(args[0], str)
    for other in args[1:]:
        # Falsy values are exempt so that the "scheme=''" default
        # argument of some functions works with bytes input too.
        if not other:
            continue
        if isinstance(other, str) != first_is_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
126
127# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin giving str-based result tuples an encode() to their bytes form"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and wrap them in ``_encoded_counterpart``."""
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
134
135
class _ResultMixinBytes(object):
    """Mixin giving bytes-based result tuples a decode() to their str form"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and wrap them in ``_decoded_counterpart``."""
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
142
143
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    The properties below read two attributes that this class does not
    define itself: ``_userinfo``, a (username, password) pair, and
    ``_hostinfo``, a (hostname, port) pair.
    """
    __slots__ = ()

    @property
    def username(self):
        """First item of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port number as an int, or None when no port is present.

        Raises ValueError when the port is not made of ASCII digits only
        or lies outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone is too lenient for a port: it also accepts a sign,
        # surrounding whitespace, "_" digit separators and non-ASCII
        # digits, none of which are valid in a URL port.
        if not (port.isascii() and port.isdigit()):
            message = f'Port could not be cast to integer value as {port!r}'
            raise ValueError(message)
        port = int(port, 10)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)
181
182
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str implementation of the netloc-splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) from netloc; None for each missing part."""
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) from netloc; port is None when absent or empty."""
        host_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_port.partition('[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the "]".
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_port.partition(':')
        return host, (port or None)
211
212
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes implementation of the netloc-splitting helpers."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) from netloc; None for each missing part."""
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) from netloc; port is None when absent or empty."""
        host_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_port.partition(b'[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the b"]".
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_port.partition(b':')
        return host, (port or None)
241
242
243from collections import namedtuple
244
# --- DefragResult -----------------------------------------------------

_DefragResultBase = namedtuple('DefragResult', 'url fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

# --- SplitResult ------------------------------------------------------

_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

# --- ParseResult ------------------------------------------------------

_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Every field ParseResult shares with SplitResult reuses its description;
# only "params" needs text of its own.
for _shared_field in ('scheme', 'netloc', 'path', 'query', 'fragment'):
    getattr(_ParseResultBase, _shared_field).__doc__ = (
        getattr(_SplitResultBase, _shared_field).__doc__)
del _shared_field

_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""
310
311
# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr


# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result with ``url`` and ``fragment`` fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with '#fragment' appended when fragment is set."""
        fragment = self.fragment
        return (self.url + '#' + fragment) if fragment else self.url


class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str 5-field result with the netloc helper properties."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields with urlunsplit()."""
        return urlunsplit(self)


class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str 6-field result with the netloc helper properties."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields with urlunparse()."""
        return urlunparse(self)
335
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes result with ``url`` and ``fragment`` fields."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with b'#fragment' appended when fragment is set."""
        fragment = self.fragment
        return (self.url + b'#' + fragment) if fragment else self.url


class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes 5-field result with the netloc helper properties."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields with urlunsplit()."""
        return urlunsplit(self)


class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes 6-field result with the netloc helper properties."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the fields with urlunparse()."""
        return urlunparse(self)
354
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link each str result class with its bytes counterpart.

    After this runs, every str class names its bytes twin in
    ``_encoded_counterpart`` and every bytes class names its str twin in
    ``_decoded_counterpart``; the encode()/decode() mixins rely on these.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

_fix_result_transcoding()
del _fix_result_transcoding
368
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into six parts:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a named 6-tuple whose fields match the layout above: a
    ParseResult for str input, or its encoded counterpart for any other
    input.  The returned object also exposes the username, password,
    hostname and port pieces of netloc as attributes.

    scheme is the value used for the scheme field when url does not
    carry one.

    With allow_fragments false the fragment is left attached to the
    component before it (path or query) instead of being split off.

    % escapes are left as they are.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ";params" split off.
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
398
def _splitparams(url):
    """Split the ";params" suffix off the last path segment of *url*.

    Returns a (path, params) pair.  params is '' when the last segment
    contains no ';' -- including when *url* contains no ';' at all.
    """
    if '/' in url:
        # Only a ';' at or after the final '/' belongs to the last segment.
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    if i < 0:
        # Without this guard the slices below would use i == -1 and chop
        # the last character off the path.
        return url, ''
    return url[:i], url[i+1:]
407
def _splitnetloc(url, start=0):
    """Return (netloc, rest): url[start:] cut at the first '/', '?' or '#'.

    When none of those delimiters occurs at or after *start*, everything
    from *start* on is the netloc and rest is empty.
    """
    positions = (url.find(delim, start) for delim in '/?#')
    hits = [pos for pos in positions if pos >= 0]
    end = min(hits, default=len(url))
    return url[start:end], url[end:]
415
def _checknetloc(netloc):
    """Raise ValueError if NFKC normalisation adds a delimiter to *netloc*.

    Empty and pure-ASCII values are accepted without further checks.
    """
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # Drop the delimiters that are already present, so that only ones
    # produced by the normalisation itself can trigger the error.
    stripped = netloc
    for delim in '@:#?':
        stripped = stripped.replace(delim, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(delim in normalized for delim in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
433
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.  Text before the first ':'
    is only taken as a scheme when it starts with an ASCII letter and
    consists solely of characters from scheme_chars.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Raises ValueError when netloc has an unmatched '[' or ']', or when
    _checknetloc() rejects it.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    # RFC 3986 section 3.1: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    # Requiring the leading letter keeps inputs such as "127.0.0.1:80"
    # or "+1:2" from being split into a bogus scheme and path.
    if i > 0 and url[0].isascii() and url[0].isalpha():
        if all(c in scheme_chars for c in url[:i]):
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    # The fragment is split off first so that a '?' inside it is not
    # mistaken for the start of the query.
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
485
def urlunparse(components):
    """Reassemble a URL from the six components of a parsed URL.

    The result can differ slightly from the URL that was parsed while
    still being equivalent, e.g. when the original carried redundant
    delimiters such as a ? with an empty query (the draft states that
    these are equivalent)."""
    *fields, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, params, query, fragment = fields
    if params:
        url = "%s;%s" % (url, params)
    rebuilt = urlunsplit((scheme, netloc, url, query, fragment))
    return _coerce_result(rebuilt)
496
def urlunsplit(components):
    """Join the five elements produced by urlsplit() back into one URL.

    components may be any five-item iterable.  The result can differ
    slightly from the URL that was parsed while still being equivalent,
    when the original had unnecessary delimiters (for example, a ? with
    an empty query; the RFC states that these are equivalent)."""
    *fields, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, query, fragment = fields
    # An authority part is written when there is a netloc, or when the
    # scheme is one of uses_netloc and the path does not already start
    # with '//'.
    wants_authority = bool(netloc) or (
        bool(scheme) and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)
515
def urljoin(base, url, allow_fragments=True):
    """Resolve a possibly relative URL against a base URL and return the
    resulting absolute form."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    # A different scheme, or one that does not support relative
    # references, means url is returned as given.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)

    if scheme in uses_netloc:
        if netloc:
            # url brings its own network location: nothing to inherit.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # No path of its own: reuse the base path and params, and the
        # base query too unless url supplies one.
        query = query or bquery
        return _coerce_result(urlunparse((scheme, netloc, bpath,
                                          bparams, query, fragment)))

    base_segments = bpath.split('/')
    if base_segments[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_segments.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_segments + path.split('/')
        # drop empty inner elements, which would otherwise turn into
        # redundant slashes when the path is re-joined
        segments[1:-1] = [piece for piece in segments[1:-1] if piece]

    resolved = []
    for piece in segments:
        if piece == '.':
            continue
        if piece != '..':
            resolved.append(piece)
        elif resolved:
            # a '..' with nothing left to remove is ignored (rfc3986)
            resolved.pop()

    if segments[-1] in ('.', '..'):
        # the reference ended in a relative directory, so the result
        # needs a trailing '/'
        resolved.append('')

    joined_path = '/'.join(resolved) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
583
584
def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns a DefragResult (or its encoded counterpart for non-str
    input) holding the URL without its fragment and the fragment itself;
    the fragment is the empty string when the URL has none.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    s, n, p, a, q, frag = urlparse(url)
    stripped = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(stripped, frag))
600
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    global _hextobyte
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Attribute access only: raises for falsy values that are not
        # string-like (e.g. None) instead of quietly returning b''.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # The lookup table is built on first use so that it costs no memory
    # when the function is never called.
    if _hextobyte is None:
        table = {}
        for high in _hexdig:
            for low in _hexdig:
                pair = high + low
                table[pair.encode()] = bytes.fromhex(pair)
        _hextobyte = table
    out = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid %xx escape: keep the text as it was.
            out.append(b'%')
            out.append(chunk)
        else:
            out.append(byte)
            out.append(chunk[2:])
    return b''.join(out)
633
_asciire = re.compile('([\x00-\x7f]+)')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes with the characters they stand for.

    encoding and errors say how percent-encoded sequences are decoded
    into Unicode characters, as accepted by the bytes.decode() method.
    The defaults decode with UTF-8 and substitute a placeholder
    character for invalid sequences.

    unquote('abc%20def') -> 'abc def'.
    """
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute access only: raises for objects that support "in"
        # but are not string-like.
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # Splitting on a capturing group alternates non-ASCII text (even
    # indexes) with ASCII runs (odd indexes); only the ASCII runs can
    # hold escapes.
    bits = _asciire.split(string)
    pieces = [bits[0]]
    for ascii_run, other in zip(bits[1::2], bits[2::2]):
        pieces.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        pieces.append(other)
    return ''.join(pieces)
662
663
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None,
             separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        separator: str or bytes. The symbol that separates the query
            arguments. Defaults to '&'; ';' is no longer treated as a
            separator unless it is passed here explicitly.

        Returns a dictionary mapping each name to the list of its values.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        parsed_result.setdefault(name, []).append(value)
    return parsed_result


def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None,
              separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were  not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError
            if there are more than n fields read by parse_qsl().

        separator: str or bytes. The symbol that separates the query
            arguments. Defaults to '&'; ';' is no longer treated as a
            separator unless it is passed here explicitly.

        Raises ValueError when separator is empty or not str/bytes, when
        max_num_fields is exceeded, or (with strict_parsing) on a field
        without '='.

        Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)

    if not separator or not isinstance(separator, (str, bytes)):
        raise ValueError("Separator must be of type string or bytes.")
    # qs is a str from here on, so a bytes separator has to follow suit.
    separator, _ = _coerce_args(separator)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    # Splitting on a single separator only: accepting both '&' and ';'
    # lets a proxy and the application disagree on where a field ends,
    # which enables web cache poisoning (CVE-2021-23336).
    r = []
    for name_value in qs.split(separator):
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r
762
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Variant of unquote() that first turns plus signs into spaces, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
771
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """Mapping from byte values (ints in range(0,256)) to strings.

    A byte that belongs to the safe set (the always-safe characters plus
    the ones given to the constructor) maps to its own character; every
    other byte maps to its percent-encoded form.
    """
    # The instance doubles as its own cache: __missing__ stores each
    # answer, so a repeated lookup of the same key is a plain dict hit
    # that runs no Python code.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE | frozenset(safe)

    def __repr__(self):
        # The inherited repr would present this as a defaultdict.
        return "<%s %r>" % (type(self).__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: work out the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
800
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Different parts of a URL (path info, query, ...) reserve different
    characters, all of which have to be quoted there.  This function
    quotes cautiously rather than minimally, which makes it usable for
    most of those parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax defines
    these character classes:

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    A reserved character is reserved in some component of a URL, though
    not necessarily in every one of them.

    Every character is %-escaped unless it is unreserved ("always safe")
    or listed in the safe argument.

    safe defaults to '/': the slash is reserved, but the usual caller is
    quoting a path and wants its existing slashes left alone.

    Python 3.7 moved from RFC 2396 to RFC 3986 for quoting URL strings,
    which added "~" to the unreserved characters.

    string and safe may each be a str or a bytes object.  encoding and
    errors must not be given when string is a bytes object (TypeError).

    encoding and errors control how non-ASCII characters are handled, as
    accepted by the str.encode method.  The defaults are encoding='utf-8'
    and errors='strict' (unsupported characters raise a
    UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Non-str input is passed through as is, so codec arguments
        # would have no effect and are rejected.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    error_mode = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, error_mode), safe)
854
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but additionally turn each ' ' into '+', as HTML form
    values require.  A literal plus sign in the input is escaped unless it
    is listed in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Work out whether the input (str or bytes) contains a space at all.
    # Inputs of any other type always take the general path below.
    if isinstance(string, str):
        has_space = ' ' in string
    elif isinstance(string, bytes):
        has_space = b' ' in string
    else:
        has_space = True

    if not has_space:
        # Without spaces, plain quote() already gives the right answer.
        return quote(string, safe, encoding, errors)

    # Keep spaces unescaped during quoting (matching the type of safe),
    # then swap them for plus signs.
    blank = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + blank, encoding, errors)
    return quoted.replace(' ', '+')
871
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes (or bytearray) object instead of a
    str and performs no string-to-bytes encoding.  The result is always an
    ASCII string; escapes use upper-case hex digits.
    quote_from_bytes(b'abc def\\x3f') -> 'abc%20def%3F'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Bring 'safe' into a canonical bytes form, discarding non-ASCII members.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)

    # Fast path: nothing remains after stripping safe bytes, so every byte
    # is safe and the input can be returned decoded.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()

    # Reuse the per-'safe' quoter when one is cached; otherwise build it
    # and remember it for next time.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
893
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Any other object is converted with str() before quoting.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping (an object with an
    items attribute) nor a sequence whose first element is a tuple.
    """

    def quote_part(part):
        # bytes are quoted verbatim, without encoding/errors; anything else
        # is converted with str() and quoted with encoding/errors.
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # Non-sequence items do not work with len(); non-empty strings
            # fail the tuple check.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types get here and succeed; that
            # is kept for consistency with the acceptance of empty dicts.
        except TypeError as err:
            # Re-raise with a descriptive message while keeping the
            # traceback of the original failure.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    pairs = []
    if not doseq:
        for key, value in query:
            pairs.append(quote_part(key) + '=' + quote_part(value))
    else:
        for key, value in query:
            key = quote_part(key)
            if isinstance(value, bytes):
                pairs.append(key + '=' + quote_via(value, safe))
            elif isinstance(value, str):
                # A str value is a single parameter, not a sequence of
                # characters.
                pairs.append(key + '=' +
                             quote_via(value, safe, encoding, errors))
            else:
                try:
                    # Anything supporting len() is treated as a sequence.
                    len(value)
                except TypeError:
                    # Not a sequence: encode it as one scalar value.
                    pairs.append(key + '=' + quote_part(value))
                else:
                    # One key=value parameter per element.
                    for element in value:
                        pairs.append(key + '=' + quote_part(element))
    return '&'.join(pairs)
973
974
def to_bytes(url):
    """Deprecated public wrapper: warn, then delegate to _to_bytes()."""
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
979
980
def _to_bytes(url):
    """Check that a str URL is pure ASCII and return it.

    Despite the name, a str argument comes back as a str (it is encoded
    to ASCII and decoded again); a non-str argument is returned untouched.
    Raises UnicodeError if a str URL contains non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
993
994
def unwrap(url):
    """Turn a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  An enclosing '<...>' pair and a leading 'URL:' marker are
    each removed when present; otherwise the stripped text is returned.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
1006
1007
def splittype(url):
    """Deprecated public wrapper: warn, then delegate to _splittype()."""
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittype(url)
1013
1014
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  When url does not start with a
    'scheme:' prefix (one or more characters other than '/' and ':'
    followed by ':'), the result is (None, url).
    """
    global _typeprog
    pattern = _typeprog
    if pattern is None:
        # Compile on first use and keep the pattern in the module global.
        pattern = _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = pattern.match(url)
    if not found:
        return None, url
    return found.group(1).lower(), found.group(2)
1027
1028
def splithost(url):
    """Deprecated public wrapper: warn, then delegate to _splithost()."""
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splithost(url)
1034
1035
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty
    remainder that does not begin with '/' gets one prepended.  When url
    does not start with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compile on first use and keep the pattern in the module global.
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url

    netloc, rest = found.group(1, 2)
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return netloc, rest
1050
1051
def splituser(host):
    """Deprecated public wrapper: warn, then delegate to _splituser()."""
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splituser(host)
1057
1058
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without an '@' the user part is
    None and the input is returned as the host part.
    """
    at = host.rfind('@')
    if at < 0:
        return None, host
    return host[:at], host[at + 1:]
1063
1064
def splitpasswd(user):
    """Deprecated public wrapper: warn, then delegate to _splitpasswd()."""
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1070
1071
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the password part
    is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        secret = None
    return name, secret
1076
1077
def splitport(host):
    """Deprecated public wrapper: warn, then delegate to _splitport()."""
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitport(host)
1083
1084
# Lazily compiled pattern used by _splitport() to split 'host:port'.
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  If the input ends
    in a bare ':' the colon is dropped and the port is None; if there is
    no trailing ':digits' at all, the input is returned with port None.
    """
    global _portprog
    if _portprog is None:
        # Compile on first use and keep the pattern in the module global.
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = _portprog.fullmatch(host)
    if found is None:
        return host, None
    name, port = found.group(1, 2)
    return (name, port) if port else (name, None)
1099
1100
def splitnport(host, defport=-1):
    """Deprecated public wrapper: warn, then delegate to _splitnport()."""
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1106
1107
def _splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    The split happens at the last ':'.
    Return defport (default -1) if no ':' is found or nothing follows it.
    Return the port as an int if only ASCII digits follow the ':'.
    Return None as the port if anything else follows the ':'.
    """
    name, colon, port = host.rpartition(':')
    if not colon:
        # Without a ':' rpartition leaves the whole input in the last slot.
        return port, defport
    if not port:
        return name, defport
    # int() alone is too lenient for a port: it also accepts signs,
    # surrounding whitespace, underscores and non-ASCII digits.  Only a
    # run of ASCII digits is a valid port.
    if not (port.isascii() and port.isdigit()):
        return name, None
    try:
        return name, int(port)
    except ValueError:
        # int() can still refuse the text, e.g. on interpreters that cap
        # the length of integer strings.
        return name, None
1123
1124
def splitquery(url):
    """Deprecated public wrapper: warn, then delegate to _splitquery()."""
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1130
1131
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the query part is
    None and url is returned unchanged as the path.
    """
    cut = url.rfind('?')
    if cut < 0:
        return url, None
    return url[:cut], url[cut + 1:]
1138
1139
def splittag(url):
    """Deprecated public wrapper: warn, then delegate to _splittag()."""
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittag(url)
1145
1146
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the tag part is
    None and url is returned unchanged as the path.
    """
    head, hash_mark, fragment = url.rpartition('#')
    return (head, fragment) if hash_mark else (url, None)
1153
1154
def splitattr(url):
    """Deprecated public wrapper: warn, then delegate to _splitattr()."""
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1160
1161
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attrs = url.split(';')
    return path, attrs
1167
1168
def splitvalue(attr):
    """Deprecated public wrapper: warn, then delegate to _splitvalue()."""
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1174
1175
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without an '=' the value part
    is None.
    """
    eq = attr.find('=')
    if eq < 0:
        return attr, None
    return attr[:eq], attr[eq + 1:]
1180