• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
and L. Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28
29The WHATWG URL Parser spec should also be considered.  We are not compliant with
30it either due to existing user code API behavior expectations (Hyrum's Law).
31It serves as a useful guide when making changes.
32"""
33
34from collections import namedtuple
35import functools
36import math
37import re
38import types
39import warnings
40import ipaddress
41
# Public names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a reference relative to the base;
# for any other scheme urljoin() returns the reference unchanged.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtsps', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes that take a "//netloc" part; consulted by urljoin() and by
# urlunsplit() when deciding whether to emit a leading "//".
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtsps', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss', 'itms-services']

# Schemes for which urlparse() splits ";params" off the last path segment.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtsps', 'rtspu', 'sip',
               'sips', 'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtsps', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
# (urlsplit() only accepts a scheme made up entirely of these characters).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
# urlsplit() strips these from the left of the URL only, and from both
# ends of the default scheme.
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '

# Unsafe bytes to be removed per WHATWG spec
# (urlsplit() deletes them wherever they occur in the URL and the scheme).
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
93
def clear_cache():
    """Reset the module's internal memoization caches.

    Undocumented; it exists because some tests want a clean slate.
    """
    for cached_function in (urlsplit, _byte_quoter_factory):
        cached_function.cache_clear()
98
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# Default codec and error handler of _encode_result() and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
107
def _noop(obj):
    """Return *obj* unchanged (the result coercion handed out for str input)."""
    return obj
110
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode *obj* via its encode() method (the result coercion for non-str input)."""
    return obj.encode(encoding, errors)
114
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Return a tuple with every item of *args* decoded to str.

    Falsy items (empty bytes, '', None, ...) are not decoded; they are
    replaced by the empty string.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
118
def _coerce_args(*args):
    """Normalise the arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item, the
    function to apply to the result:
      - _noop when the first argument is a str
      - _encode_result otherwise (the arguments are decoded first)

    Raises TypeError when a non-empty argument disagrees with the first
    argument about being a str.
    """
    first_is_str = isinstance(args[0], str)
    # Empty values are exempt from the check so that the "scheme=''"
    # default argument of some functions also works for non-str input.
    mixed = any(arg and isinstance(arg, str) != first_is_str
                for arg in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
134
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Mixin giving a str result tuple an encode() method."""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart of this result.

        Every field is encoded with *encoding*/*errors* and the encoded
        fields are passed positionally to self._encoded_counterpart.
        """
        make_encoded = self._encoded_counterpart
        return make_encoded(*[field.encode(encoding, errors) for field in self])
142
143
class _ResultMixinBytes(object):
    """Mixin giving a bytes result tuple a decode() method."""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart of this result.

        Every field is decoded with *encoding*/*errors* and the decoded
        fields are passed positionally to self._decoded_counterpart.
        """
        make_decoded = self._decoded_counterpart
        return make_decoded(*[field.decode(encoding, errors) for field in self])
150
151
class _NetlocResultMixinBase(object):
    """Shared accessors for parsed results that carry a netloc element.

    Subclasses supply _userinfo, a (username, password) pair, and
    _hostinfo, a (hostname, port) pair.
    """
    __slots__ = ()

    @property
    def username(self):
        """First item of the _userinfo pair."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second item of the _userinfo pair."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host is empty."""
        host = self._hostinfo[0]
        if not host:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        percent_sign = '%' if isinstance(host, str) else b'%'
        address, percent, zone = host.partition(percent_sign)
        return address.lower() + percent + zone

    @property
    def port(self):
        """Port as an int, or None when the netloc carries no port.

        Raises ValueError when the port is not made of ASCII digits only or
        lies outside 0-65535.
        """
        raw_port = self._hostinfo[1]
        if raw_port is None:
            return None
        if not (raw_port.isdigit() and raw_port.isascii()):
            raise ValueError(f"Port could not be cast to integer value as {raw_port!r}")
        number = int(raw_port)
        if number < 0 or number > 65535:
            raise ValueError("Port out of range 0-65535")
        return number

    __class_getitem__ = classmethod(types.GenericAlias)
188
189
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    # str flavour of the netloc splitting used by the shared accessors.
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the text before the last '@'.

        Both are None when the netloc has no '@'; the password alone is
        None when the user info has no ':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the text after the last '@'.

        A bracketed host is returned without its brackets.  The port is
        the text after the ':' separator, or None when that text is empty
        or absent.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = hostinfo.partition('[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        return hostname, (port or None)
218
219
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    # bytes flavour of the netloc splitting used by the shared accessors.
    __slots__ = ()

    @property
    def _userinfo(self):
        """(username, password) taken from the bytes before the last b'@'.

        Both are None when the netloc has no b'@'; the password alone is
        None when the user info has no b':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """(hostname, port) taken from the bytes after the last b'@'.

        A bracketed host is returned without its brackets.  The port is
        the bytes after the b':' separator, or None when they are empty
        or absent.
        """
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = hostinfo.partition(b'[')
        if open_bracket:
            hostname, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        return hostname, (port or None)
248
249
# Plain namedtuple bases; the str and bytes result classes defined later in
# this module combine them with the encode/decode and netloc mixins.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')
255
256_DefragResultBase.__doc__ = """
257DefragResult(url, fragment)
258
259A 2-tuple that contains the url without fragment identifier and the fragment
260identifier as a separate argument.
261"""
262
263_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
264
265_DefragResultBase.fragment.__doc__ = """
266Fragment identifier separated from URL, that allows indirect identification of a
267secondary resource by reference to a primary resource and additional identifying
268information.
269"""
270
271_SplitResultBase.__doc__ = """
272SplitResult(scheme, netloc, path, query, fragment)
273
274A 5-tuple that contains the different components of a URL. Similar to
275ParseResult, but does not split params.
276"""
277
278_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
279
280_SplitResultBase.netloc.__doc__ = """
281Network location where the request is made to.
282"""
283
284_SplitResultBase.path.__doc__ = """
285The hierarchical path, such as the path to a file to download.
286"""
287
288_SplitResultBase.query.__doc__ = """
289The query component, that contains non-hierarchical data, that along with data
290in path component, identifies a resource in the scope of URI's scheme and
291network location.
292"""
293
294_SplitResultBase.fragment.__doc__ = """
295Fragment identifier, that allows indirect identification of a secondary resource
296by reference to a primary resource and additional identifying information.
297"""
298
299_ParseResultBase.__doc__ = """
300ParseResult(scheme, netloc, path, params, query, fragment)
301
302A 6-tuple that contains components of a parsed URL.
303"""
304
305_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
306_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
307_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
308_ParseResultBase.params.__doc__ = """
309Parameters for last path element used to dereference the URI in order to provide
310access to perform some operation on the resource.
311"""
312
313_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
314_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
315
316
# For backwards compatibility, ResultBase is kept as an alias of
# _NetlocResultMixinStr.  It is no longer part of the documented API, but
# it is retained since deprecating it isn't worth the hassle.
ResultBase = _NetlocResultMixinStr
321
# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment, if non-empty, re-attached after '#'."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
330
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    # str 5-tuple (scheme, netloc, path, query, fragment) plus the
    # netloc-derived attributes supplied by _NetlocResultMixinStr.
    __slots__ = ()
    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunsplit()."""
        return urlunsplit(self)
335
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    # str 6-tuple (scheme, netloc, path, params, query, fragment) plus the
    # netloc-derived attributes supplied by _NetlocResultMixinStr.
    __slots__ = ()
    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunparse()."""
        return urlunparse(self)
340
# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL with the fragment, if non-empty, re-attached after b'#'."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
349
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    # bytes 5-tuple (scheme, netloc, path, query, fragment) plus the
    # netloc-derived attributes supplied by _NetlocResultMixinBytes.
    __slots__ = ()
    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunsplit()."""
        return urlunsplit(self)
354
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    # bytes 6-tuple (scheme, netloc, path, params, query, fragment) plus the
    # netloc-derived attributes supplied by _NetlocResultMixinBytes.
    __slots__ = ()
    def geturl(self):
        """Reassemble the URL by handing this tuple to urlunparse()."""
        return urlunparse(self)
359
# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link every str result class with its bytes twin, in both directions."""
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
373
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a named 6-tuple whose fields match the layout above: a
    ParseResult for str input, a ParseResultBytes for bytes input.

    The username, password, hostname, and port sub-components of netloc
    are available as attributes of the returned object.

    *scheme* is the default used for the scheme component when the URL
    itself does not name one.

    When *allow_fragments* is false the fragment is not separated; it stays
    part of the preceding component (path or query).

    The params are only split off for schemes listed in uses_params.

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
403
def _splitparams(url):
    """Split a path into ``(path, params)`` at the ';' of its last segment.

    Only a ';' located after the last '/' (or anywhere, when the path has
    no '/') separates the params.  When there is no such ';' the path is
    returned unchanged together with an empty params string.
    """
    # rfind() yields -1 when there is no '/'; clamp it to 0 because a
    # negative start would make find() look at the last character only.
    search_from = max(url.rfind('/'), 0)
    i = url.find(';', search_from)
    if i < 0:
        # Previously only the '/' branch guarded this case, so a path with
        # neither '/' nor ';' was sliced with i == -1 and lost its last
        # character.
        return url, ''
    return url[:i], url[i+1:]
412
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for the netloc beginning at index *start*.

    The netloc runs up to the earliest '/', '?' or '#' found at or after
    *start*, or to the end of *url* when none of them occurs.
    """
    found = [pos for pos in (url.find(delim, start) for delim in '/?#')
             if pos >= 0]
    end = min(found, default=len(url))
    return url[start:end], url[end:]
420
def _checknetloc(netloc):
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    Empty and pure-ASCII netlocs are accepted without further checks.
    Otherwise ValueError is raised when normalizing the netloc (with its
    existing '@', ':', '#' and '?' removed) produces any of '/?#@:'.
    """
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # Drop the delimiters that are already present, keeping the text
    # around them, so that only newly introduced ones are detected.
    stripped = netloc
    for delimiter in '@:#?':
        stripped = stripped.replace(delimiter, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if normalized == stripped:
        return
    if any(delimiter in normalized for delimiter in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
438
# Valid bracketed hosts are defined in
# https://www.rfc-editor.org/rfc/rfc3986#page-49 and https://url.spec.whatwg.org/
def _check_bracketed_host(hostname):
    """Validate the text found between '[' and ']' in a netloc.

    A host starting with 'v' must look like an IPvFuture literal ('v',
    hex digits, a dot, then at least one more character).  Any other host
    must be accepted by ipaddress.ip_address() and must not be an IPv4
    address.  Raises ValueError otherwise.
    """
    if not hostname.startswith('v'):
        # ip_address() itself raises ValueError for text that is neither
        # an IPv4 nor an IPv6 address.
        address = ipaddress.ip_address(hostname)
        if isinstance(address, ipaddress.IPv4Address):
            raise ValueError("An IPv4 address cannot be in brackets")
        return
    if re.match(r"\Av[a-fA-F0-9]+\..+\Z", hostname) is None:
        raise ValueError("IPvFuture address is invalid")
449
def _check_bracketed_netloc(netloc):
    """Check that the brackets in *netloc* enclose exactly its host part.

    The host part is the text after the last '@', split the same way as
    the _hostinfo properties of the result classes do it.  The opening
    bracket must be its first character and the closing bracket may only
    be followed by nothing or by a ':' (the port delimiter).  The text
    between the brackets is then validated by _check_bracketed_host().

    Raises ValueError for misplaced brackets or an invalid bracketed host.
    """
    hostinfo = netloc.rpartition('@')[2]
    before_bracket, have_open_br, bracketed = hostinfo.partition('[')
    # The brackets have to belong to the host (not to the user info) and
    # no data is allowed in front of the opening bracket.
    if not have_open_br or before_bracket:
        raise ValueError("Invalid IPv6 URL")
    hostname, have_close_br, trailer = bracketed.partition(']')
    # No data is allowed after the bracket but before the port delimiter.
    if not have_close_br or (trailer and not trailer.startswith(':')):
        raise ValueError("Invalid IPv6 URL")
    _check_bracketed_host(hostname)

# typed=True avoids BytesWarnings being emitted during cache key
# comparison since this API supports both bytes and str input.
@functools.lru_cache(typed=True)
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Raises ValueError when the netloc has unbalanced or misplaced square
    brackets, when a bracketed host is not a valid IPv6 or IPvFuture
    literal, or when the netloc gains URL delimiters under NFKC
    normalization.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)

    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
        scheme = scheme.replace(b, "")

    allow_fragments = bool(allow_fragments)
    netloc = query = fragment = ''
    i = url.find(':')
    # A scheme starts with an ASCII letter and consists of scheme_chars only;
    # otherwise the text before ':' stays part of the URL.
    if i > 0 and url[0].isascii() and url[0].isalpha():
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
        if '[' in netloc:
            # Checking only the text between the first '[' and the next ']'
            # would let through netlocs such as 'prefix[::1]suffix' or
            # '[::1]@host', so validate where the brackets sit as well.
            _check_bracketed_netloc(netloc)
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    return _coerce_result(v)
508
def urlunparse(components):
    """Reassemble a URL from the six components produced by urlparse().

    The outcome may differ slightly from the URL that was parsed, while
    remaining equivalent, if the original had redundant delimiters, e.g. a
    ? with an empty query (the draft states that these are equivalent).
    """
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Non-empty params are folded back into the path before delegating.
    path = f"{url};{params}" if params else url
    return _coerce_result(urlunsplit((scheme, netloc, path, query, fragment)))
519
def urlunsplit(components):
    """Reassemble a complete URL from the five components of urlsplit().

    *components* can be any five-item iterable.  The outcome may differ
    slightly from the URL that was parsed, while remaining equivalent, if
    the original had unnecessary delimiters (for example, a ? with an empty
    query; the RFC states that these are equivalent).
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if netloc:
        # A non-empty path has to be rooted before it can follow a netloc.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    elif url[:2] == '//' or (
            scheme and scheme in uses_netloc and (not url or url[:1] == '/')):
        # Either the path itself starts with '//' and needs an explicit
        # empty netloc in front of it, or the scheme is a netloc-using one
        # with an empty or rooted path.
        url = '//' + url
    if scheme:
        url = scheme + ':' + url
    for delimiter, part in (('?', query), ('#', fragment)):
        if part:
            url = url + delimiter + part
    return _coerce_result(url)
542
def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against the base URL *base*.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *url* is also returned as-is when its scheme differs
    from the base scheme or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    base_parsed = urlparse(base, '', allow_fragments)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = base_parsed
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    def rebuild(final_path):
        # Reads scheme/netloc/params/query/fragment as they are at call time.
        return _coerce_result(urlunparse(
            (scheme, netloc, final_path, params, query, fragment)))

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own netloc: nothing to inherit.
            return rebuild(path)
        netloc = bnetloc

    if not (path or params):
        # No path of its own: reuse the base path and params, and the base
        # query too unless the reference has one.
        params = bparams
        if not query:
            query = bquery
        return rebuild(bpath)

    base_parts = bpath.split('/')
    if base_parts[-1]:
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_parts.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = [part for part in segments[1:-1] if part]

    resolved_path = []
    for seg in segments:
        if seg == '.':
            continue
        if seg != '..':
            resolved_path.append(seg)
        elif resolved_path:
            # '..' segments with nothing left to remove are ignored
            # when resolving for rfc3986
            resolved_path.pop()

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return rebuild('/'.join(resolved_path) or '/')
610
611
def urldefrag(url):
    """Split any fragment off *url*.

    Returns a DefragResult (or its bytes counterpart for non-str input)
    holding the URL without the fragment and the fragment itself, which is
    the empty string when the URL contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(defrag, frag))
627
628_hexdig = '0123456789ABCDEFabcdef'
629_hextobyte = None
630
631def unquote_to_bytes(string):
632    """unquote_to_bytes('abc%20def') -> b'abc def'."""
633    return bytes(_unquote_impl(string))
634
635def _unquote_impl(string: bytes | bytearray | str) -> bytes | bytearray:
636    # Note: strings are encoded as UTF-8. This is only an issue if it contains
637    # unescaped non-ASCII characters, which URIs should not.
638    if not string:
639        # Is it a string-like object?
640        string.split
641        return b''
642    if isinstance(string, str):
643        string = string.encode('utf-8')
644    bits = string.split(b'%')
645    if len(bits) == 1:
646        return string
647    res = bytearray(bits[0])
648    append = res.extend
649    # Delay the initialization of the table to not waste memory
650    # if the function is never called
651    global _hextobyte
652    if _hextobyte is None:
653        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
654                      for a in _hexdig for b in _hexdig}
655    for item in bits[1:]:
656        try:
657            append(_hextobyte[item[:2]])
658            append(item[2:])
659        except KeyError:
660            append(b'%')
661            append(item)
662    return res
663
# Matches a maximal run of ASCII characters.
_asciire = re.compile('([\x00-\x7f]+)')

def _generate_unquoted_parts(string, encoding, errors):
    """Yield the pieces of *string* with the escapes in its ASCII runs decoded.

    Non-ASCII stretches are yielded untouched; each ASCII run goes through
    _unquote_impl() and is decoded with *encoding*/*errors*.  Joining the
    yielded pieces gives the unquoted string.
    """
    cursor = 0
    for match in _asciire.finditer(string):
        yield string[cursor:match.start()]  # Non-ASCII
        yield _unquote_impl(match.group(1)).decode(encoding, errors)
        cursor = match.end()
    yield string[cursor:]  # Non-ASCII tail
675
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.  Passing None for
    encoding or errors selects the respective default, for both str and
    bytes input.

    unquote('abc%20def') -> 'abc def'.
    """
    # Resolve the None defaults before looking at the input type: they used
    # to be applied on the str path only, so bytes input combined with
    # encoding=None or errors=None failed inside bytes.decode().
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        return _unquote_impl(string).decode(encoding, errors)
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    return ''.join(_generate_unquoted_parts(string, encoding, errors))
697
698
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query string into a dictionary of value lists.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps them as blank strings.  With the default
            false value they are ignored, as if they had not been
            included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised if more than
            that many fields are read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a dictionary mapping each name to the list of its values,
        in the order parse_qsl() produced them.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
739
740
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps them as blank strings.  With the default
            false value they are ignored, as if they had not been
            included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised if the query
            holds more than that many fields.

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a list of (name, value) tuples: str pairs for str input,
        bytes pairs otherwise.
    """

    if not separator or not isinstance(separator, (str, bytes)):
        raise ValueError("Separator must be of type string or bytes.")
    if isinstance(qs, str):
        if not isinstance(separator, str):
            separator = str(separator, 'ascii')
        eq = '='
        def _unquote(s):
            return unquote_plus(s, encoding=encoding, errors=errors)
    else:
        if not qs:
            return []
        # Use memoryview() to reject integers and iterables,
        # acceptable by the bytes constructor.
        qs = bytes(memoryview(qs))
        if isinstance(separator, str):
            separator = bytes(separator, 'ascii')
        eq = b'='
        def _unquote(s):
            return unquote_to_bytes(s.replace(b'+', b' '))

    if not qs:
        return []

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    pairs = []
    for field in qs.split(separator):
        # Empty fields are skipped unless strict parsing has to report them.
        if not field and not strict_parsing:
            continue
        name, has_eq, value = field.partition(eq)
        if strict_parsing and not has_eq:
            raise ValueError("bad query field: %r" % (field,))
        if not value and not keep_blank_values:
            continue
        pairs.append((_unquote(name), _unquote(value)))
    return pairs
813
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Decode an HTML-form style quoted string.

    Works like unquote(), except that every plus sign is first turned into
    a space, which is how form values encode spaces.  encoding and errors
    are forwarded to unquote() unchanged.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    # '+' must be translated before percent-decoding so that an escaped
    # plus ('%2B') survives as a literal '+'.
    return unquote(string.replace('+', ' '), encoding, errors)
822
# Byte values of the RFC 3986 "unreserved" characters (ASCII letters, digits
# and "_.-~"); the quoting code below never escapes these.
_ALWAYS_SAFE = frozenset(
    b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-~')
# The same byte values as a bytes object, for APIs that take a set of bytes
# in that form (e.g. bytes.rstrip()).
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
828
def __getattr__(name):
    """Module-level attribute fallback.

    Only the legacy name 'Quoter' is served: looking it up emits a
    DeprecationWarning and returns the private _Quoter class.  Every other
    name raises AttributeError.
    """
    if name != 'Quoter':
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
    warnings.warn('Deprecated in 3.11. '
                  'urllib.parse.Quoter will be removed in Python 3.14. '
                  'It was not intended to be a public API.',
                  DeprecationWarning, stacklevel=2)
    return _Quoter
837
class _Quoter(dict):
    """Lazily filled table from a byte value (an int) to its quoted text.

    A byte that belongs to self.safe maps to the corresponding character;
    any other byte maps to its percent-encoded form with two uppercase hex
    digits.
    """
    # The dict itself is the cache: __missing__ stores each result, so a
    # repeated lookup is a plain dict hit that runs no Python-level code.
    def __init__(self, safe):
        """safe: bytes object listing extra bytes to leave unquoted."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        return f"<Quoter {dict(self)!r}>"

    def __missing__(self, b):
        # First lookup of this byte: compute the quoted form, remember it,
        # and hand it back.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
858
def quote(string, safe='/', encoding=None, errors=None):
    """Percent-encode string: quote('abc def') -> 'abc%20def'.

    Different URL components (the path, the query, ...) reserve different
    characters, so this function takes a cautious rather than minimal
    approach that is usable for most of them.

    RFC 3986 (Uniform Resource Identifier (URI): Generic Syntax) defines:

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Every character that is neither unreserved ("always safe") nor listed
    in safe is %-escaped.  A reserved character is reserved in only some
    URL components, which is why safe is configurable.  It defaults to '/'
    because the usual input is a path whose slashes must be preserved.

    Since Python 3.7 quoting follows RFC 3986 rather than RFC 2396, so "~"
    counts as unreserved.

    string and safe may each be a str or a bytes object.  For a str input,
    encoding and errors are passed to str.encode() and default to 'utf-8'
    and 'strict' (unsupported characters raise UnicodeEncodeError); an
    empty str is returned as-is.  For a bytes input they must not be
    specified, otherwise TypeError is raised.
    """
    if not isinstance(string, str):
        # Already bytes: there is nothing to encode, so these options
        # would be meaningless.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    raw = string.encode('utf-8' if encoding is None else encoding,
                        'strict' if errors is None else errors)
    return quote_from_bytes(raw, safe)
912
def quote_plus(string, safe='', encoding=None, errors=None):
    """Quote string the way HTML form values require.

    Behaves like quote(), but spaces come out as '+'.  Plus signs already
    in the input are escaped unless safe contains '+'.  Unlike quote(),
    safe defaults to the empty string rather than '/'.
    """
    # string may be str or bytes.  When it holds no space at all there is
    # nothing to translate and plain quote() already gives the answer.
    no_space_str = isinstance(string, str) and ' ' not in string
    if no_space_str or (isinstance(string, bytes) and b' ' not in string):
        return quote(string, safe, encoding, errors)

    # Keep spaces unescaped for now (matching the type of safe) so they can
    # be swapped for '+' afterwards.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
929
# Expectation: A typical program is unlikely to create more than 5 of these.
@functools.lru_cache
def _byte_quoter_factory(safe):
    """Return the item-lookup function of a _Quoter built for safe.

    The result is memoised per safe value by lru_cache, so repeated calls
    with the same safe share one _Quoter and its accumulated entries.
    """
    quoter = _Quoter(safe)
    return quoter.__getitem__
934
def quote_from_bytes(bs, safe='/'):
    """Percent-encode a bytes or bytearray object; the result is a str.

    Like quote(), but no string-to-bytes encoding step is performed.
    safe may be a str or a bytes-like iterable of ints; non-ASCII entries
    in it are ignored.  Raises TypeError if bs is neither bytes nor
    bytearray.

    quote_from_bytes(b'abc def?') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to bytes with every non-ASCII entry dropped.
    # (A list comprehension is faster than a generator expression here.)
    safe = (safe.encode('ascii', 'ignore') if isinstance(safe, str)
            else bytes([c for c in safe if c < 128]))
    # Fast path: if stripping safe bytes leaves nothing, no byte needs
    # escaping and the input can simply be decoded.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _byte_quoter_factory(safe)
    total = len(bs)
    if total < 200_000:
        return ''.join(map(quoter, bs))
    # Large input: quote in chunks to save memory -
    # https://github.com/python/cpython/issues/95865
    step = math.isqrt(total)
    pieces = []
    for start in range(0, total, step):
        pieces.append(''.join(map(quoter, bs[start:start + step])))
    return ''.join(pieces)
961
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    query: a mapping (any object with an items() method) or a sequence of
        (key, value) tuples.  For a sequence, the parameters appear in the
        output in input order.
    doseq: if true, a value that is neither str nor bytes but supports
        len() is treated as a sequence and each of its elements becomes a
        separate key=element parameter.  If false, such a value is
        converted with str() like any other non-bytes value.
    safe, encoding, errors: passed down to quote_via (encoding and errors
        only when the component being quoted is not bytes).
    quote_via: callable used to quote every key and value.

    Keys and values may each be str or bytes; other objects are converted
    with str() before quoting.

    Returns the quoted key=value pairs joined with '&'.

    Raises TypeError if query is not a mapping and does not pass the
    sequence-of-tuples check (for example a non-empty string).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # Strings and string-like objects are sequences too, so they have to
        # be rejected explicitly.
        try:
            # len() fails for non-sequences; a non-empty string fails the
            # tuple check on its first element.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of any type get here and succeed.  That
            # is kept on purpose, for consistency with the original
            # implementation accepting empty dicts.
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def quote_component(component):
        # bytes go to quote_via untouched; anything else is stringified and
        # quoted with the caller's encoding/errors.
        if isinstance(component, bytes):
            return quote_via(component, safe)
        return quote_via(str(component), safe, encoding, errors)

    pairs = []
    if not doseq:
        for k, v in query:
            k = quote_component(k)
            v = quote_component(v)
            pairs.append(k + '=' + v)
        return '&'.join(pairs)

    for k, v in query:
        k = quote_component(k)
        if isinstance(v, str):
            # A str value is quoted as-is, without the str() conversion
            # applied to other non-bytes objects.
            v = quote_via(v, safe, encoding, errors)
            pairs.append(k + '=' + v)
            continue
        if isinstance(v, bytes):
            elements = (v,)
        else:
            try:
                # Anything that supports len() is expanded element by
                # element.  (Open question inherited from the original code:
                # is this a sufficient test for sequence-ness?)
                len(v)
            except TypeError:
                # Not a sequence: emit it as a single scalar value.
                elements = (v,)
            else:
                elements = v
        for elt in elements:
            elt = quote_component(elt)
            pairs.append(k + '=' + elt)
    return '&'.join(pairs)
1040
1041
def to_bytes(url):
    """Deprecated public alias of _to_bytes().

    Emits a DeprecationWarning attributed to the caller, then returns
    _to_bytes(url).
    """
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
1046
1047
def _to_bytes(url):
    """Check that a str URL is pure ASCII and return it (still as a str).

    Despite the name, no bytes object is produced.  A non-str argument is
    returned untouched.  Raises UnicodeError if a str argument contains
    non-ASCII characters.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
1060
1061
def unwrap(url):
    """Strip the '<URL:...>' decoration from a URL string.

    unwrap('<URL:scheme://host/path>') -> 'scheme://host/path'

    The argument is converted with str() and surrounding whitespace is
    removed.  Enclosing angle brackets and a leading 'URL:' prefix are each
    dropped if present; otherwise the (stripped) string comes back as is.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
1073
1074
def splittype(url):
    """Deprecated public alias of _splittype().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splittype(url).
    """
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittype(url)
1080
1081
# Compiled lazily by _splittype() on first use.
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme part is lower-cased.  If url does not start with a scheme
    (one or more characters other than '/' and ':' followed by ':'), the
    result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme, rest = found.groups()
    return scheme.lower(), rest
1094
1095
def splithost(url):
    """Deprecated public alias of _splithost().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splithost(url).
    """
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splithost(url)
1101
1102
# Compiled lazily by _splithost() on first use.
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty remainder
    that does not begin with '/' gets one prepended.  If url does not start
    with '//', the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if found is None:
        return None, url
    host_port, rest = found.groups()
    if rest and not rest.startswith('/'):
        rest = '/' + rest
    return host_port, rest
1117
1118
def splituser(host):
    """Deprecated public alias of _splituser().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splituser(host).
    """
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splituser(host)
1124
1125
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part is
    None.
    """
    userinfo, at_sign, host_port = host.rpartition('@')
    if not at_sign:
        return None, host_port
    return userinfo, host_port
1130
1131
def splitpasswd(user):
    """Deprecated public alias of _splitpasswd().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splitpasswd(user).
    """
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1137
1138
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password part
    is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
1143
1144
def splitport(host):
    """Deprecated public alias of _splitport().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splitport(host).
    """
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitport(host)
1150
1151
# Compiled lazily by _splitport() on first use.
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  A trailing ':' with
    no digits after it is dropped and the port is None.  If host does not
    end in ':' followed by digits only, it is returned whole with port
    None.
    """
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    found = _portprog.fullmatch(host)
    if found is None:
        return host, None
    name, port = found.groups()
    # An empty port (as in 'host:') is reported as None, not ''.
    return name, (port or None)
1166
1167
def splitnport(host, defport=-1):
    """Deprecated public alias of _splitnport().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splitnport(host, defport).
    """
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1173
1174
def _splitnport(host, defport=-1):
    """Split host at its last ':' and return (host, numeric port).

    The port is defport (default -1) when there is no ':' or nothing
    follows it, the int value when ASCII digits follow it, and None when
    something else follows it.
    """
    name, colon, port = host.rpartition(':')
    if not colon:
        # rpartition() leaves the whole input in the last slot.
        return port, defport
    if not port:
        return name, defport
    # str.isdigit() alone also accepts non-ASCII digits, so isascii() is
    # checked as well.
    if port.isdigit() and port.isascii():
        return name, int(port)
    return name, None
1190
1191
def splitquery(url):
    """Deprecated public alias of _splitquery().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splitquery(url).
    """
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1197
1198
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    before, mark, after = url.rpartition('?')
    return (before, after) if mark else (url, None)
1205
1206
def splittag(url):
    """Deprecated public alias of _splittag().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splittag(url).
    """
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittag(url)
1212
1213
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    before, mark, after = url.rpartition('#')
    if not mark:
        return url, None
    return before, after
1220
1221
def splitattr(url):
    """Deprecated public alias of _splitattr().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splitattr(url).
    """
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1227
1228
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    Everything before the first ';' is the path; the remaining
    ';'-separated pieces are returned as a list (empty if there is no ';').
    """
    path, *attrs = url.split(';')
    return path, attrs
1234
1235
def splitvalue(attr):
    """Deprecated public alias of _splitvalue().

    Emits a DeprecationWarning attributed to the caller, then returns
    _splitvalue(attr).
    """
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1241
1242
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value part is
    None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value
1247