• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9and L.Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any future changes to
23urlparse module should conform with it.  The urlparse module is
24currently not entirely compliant with this RFC due to de facto
25scenarios for parsing, and for backward compatibility purposes, some
26parsing quirks from older RFCs are retained. The test cases in
27test_urlparse.py provide a good indicator of parsing behavior.
28"""
29
30import re
31import sys
32import collections
33import warnings
34
35__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
36           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
37           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
38           "unquote", "unquote_plus", "unquote_to_bytes",
39           "DefragResult", "ParseResult", "SplitResult",
40           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
41
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a relative reference against the base;
# for any other scheme urljoin() returns the second URL unchanged.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes that take a "//netloc" part; consulted by urlunsplit() (to decide
# whether to emit "//") and by urljoin() (to inherit the base netloc).
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ";params" off the last path segment.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
# (urlsplit() only accepts a "scheme:" prefix made entirely of these).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() empties the whole cache (via clear_cache()) once it holds
# MAX_CACHE_SIZE entries; there is no per-entry eviction.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) -> SplitResult.
_parse_cache = {}
83
def clear_cache():
    """Empty the urlsplit() parse cache and the cache of quoter callables."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
88
89
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec arguments of _encode_result() and
# _decode_args(), i.e. bytes URLs are transcoded as strict ASCII.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
98
99def _noop(obj):
100    return obj
101
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Result coercer for bytes inputs: return obj.encode(encoding, errors).

    obj is either a str or one of the str result tuples, whose encode()
    produces the matching bytes result type.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
105
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Return a tuple with every item of args decoded to str.

    Falsy items (empty bytes, '', None, ...) are not decoded; they are
    replaced by the empty str.
    """
    decoded = []
    for item in args:
        if item:
            decoded.append(item.decode(encoding, errors))
        else:
            decoded.append('')
    return tuple(decoded)
109
def _coerce_args(*args):
    """Normalise the arguments to str and say how to convert the result back.

    Returns the (possibly decoded) arguments followed by one extra item, a
    result coercion callable:
      - _noop when the first argument is a str
      - _encode_result otherwise (the arguments are decoded first)

    Raises TypeError if a non-empty later argument disagrees with the first
    one about being a str.
    """
    first_is_str = isinstance(args[0], str)
    # Falsy values are exempt from the check so that the "scheme=''"
    # default argument of some functions can be combined with bytes input.
    mixed = any(extra and isinstance(extra, str) != first_is_str
                for extra in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
125
126# Result objects are more helpful than simple tuples
127class _ResultMixinStr(object):
128    """Standard approach to encoding parsed results from str to bytes"""
129    __slots__ = ()
130
131    def encode(self, encoding='ascii', errors='strict'):
132        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
133
134
135class _ResultMixinBytes(object):
136    """Standard approach to decoding parsed results from bytes to str"""
137    __slots__ = ()
138
139    def decode(self, encoding='ascii', errors='strict'):
140        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
141
142
143class _NetlocResultMixinBase(object):
144    """Shared methods for the parsed result objects containing a netloc element"""
145    __slots__ = ()
146
147    @property
148    def username(self):
149        return self._userinfo[0]
150
151    @property
152    def password(self):
153        return self._userinfo[1]
154
155    @property
156    def hostname(self):
157        hostname = self._hostinfo[0]
158        if not hostname:
159            return None
160        # Scoped IPv6 address may have zone info, which must not be lowercased
161        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
162        separator = '%' if isinstance(hostname, str) else b'%'
163        hostname, percent, zone = hostname.partition(separator)
164        return hostname.lower() + percent + zone
165
166    @property
167    def port(self):
168        port = self._hostinfo[1]
169        if port is not None:
170            try:
171                port = int(port, 10)
172            except ValueError:
173                message = f'Port could not be cast to integer value as {port!r}'
174                raise ValueError(message) from None
175            if not ( 0 <= port <= 65535):
176                raise ValueError("Port out of range 0-65535")
177        return port
178
179
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str flavour of the netloc helpers: splits self.netloc on str delimiters."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from the text before the last '@'.

        Both are None when there is no '@'; password is None when the
        userinfo contains no ':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = userinfo.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from the text after the last '@'.

        A bracketed host ("[...]") is returned without its brackets.  The
        port is the raw text after ':' and is None when absent or empty.
        """
        hostport = self.netloc.rpartition('@')[2]
        _, open_bracket, inside = hostport.partition('[')
        if open_bracket:
            # IPv6-style literal: host ends at ']', port follows the next ':'
            host, _, tail = inside.partition(']')
            port = tail.partition(':')[2]
        else:
            host, _, port = hostport.partition(':')
        return host, (port or None)
208
209
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes flavour of the netloc helpers: splits self.netloc on bytes delimiters."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from the bytes before the last b'@'.

        Both are None when there is no b'@'; password is None when the
        userinfo contains no b':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = userinfo.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from the bytes after the last b'@'.

        A bracketed host (b"[...]") is returned without its brackets.  The
        port is the raw bytes after b':' and is None when absent or empty.
        """
        hostport = self.netloc.rpartition(b'@')[2]
        _, open_bracket, inside = hostport.partition(b'[')
        if open_bracket:
            # IPv6-style literal: host ends at b']', port follows the next b':'
            host, _, tail = inside.partition(b']')
            port = tail.partition(b':')[2]
        else:
            host, _, port = hostport.partition(b':')
        return host, (port or None)
238
239
240from collections import namedtuple
241
# Plain namedtuple bases for the result types.  The public result classes
# defined further down combine one of these with a str or bytes mixin.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# The docstrings are assigned after creation, both on the tuple types and on
# their individual field descriptors.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field docstrings; only "params"
# is specific to ParseResult.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
313
314# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str (url, fragment) pair as returned by urldefrag()."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with '#fragment' appended when the fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
322
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str 5-tuple as returned by urlsplit(), with the netloc accessors mixed in."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunsplit()."""
        reassembled = urlunsplit(self)
        return reassembled
327
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str 6-tuple as returned by urlparse(), with the netloc accessors mixed in."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunparse()."""
        reassembled = urlunparse(self)
        return reassembled
332
333# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes counterpart of DefragResult."""
    __slots__ = ()

    def geturl(self):
        """Return the URL, with b'#fragment' appended when the fragment is non-empty."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
341
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of SplitResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunsplit()."""
        reassembled = urlunsplit(self)
        return reassembled
346
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes counterpart of ParseResult."""
    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL via urlunparse()."""
        reassembled = urlunparse(self)
        return reassembled
351
352# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    The links are what encode() and decode() on the result mixins use to
    pick the class of the converted result.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# One-shot setup: run it, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
365
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult (scheme, netloc, path, params, query, fragment),
    or its bytes counterpart when the input was bytes.  The components are
    not broken up any further (e.g. netloc stays a single string) and
    % escapes are not expanded.  params is only split off for the schemes
    listed in uses_params.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
381
382def _splitparams(url):
383    if '/'  in url:
384        i = url.find(';', url.rfind('/'))
385        if i < 0:
386            return url, ''
387    else:
388        i = url.find(';')
389    return url[:i], url[i+1:]
390
391def _splitnetloc(url, start=0):
392    delim = len(url)   # position of end of domain part of url, default is end
393    for c in '/?#':    # look for delimiters; the order is NOT important
394        wdelim = url.find(c, start)        # find first of this delim
395        if wdelim >= 0:                    # if found
396            delim = min(delim, wdelim)     # use earliest delim position
397    return url[start:delim], url[delim:]   # return (domain, rest)
398
399def _checknetloc(netloc):
400    if not netloc or netloc.isascii():
401        return
402    # looking for characters like \u2100 that expand to 'a/c'
403    # IDNA uses NFKC equivalence, so normalize for this check
404    import unicodedata
405    n = netloc.replace('@', '')   # ignore characters already included
406    n = n.replace(':', '')        # but not the surrounding text
407    n = n.replace('#', '')
408    n = n.replace('?', '')
409    netloc2 = unicodedata.normalize('NFKC', n)
410    if n == netloc2:
411        return
412    for c in '/?#@:':
413        if c in netloc2:
414            raise ValueError("netloc '" + netloc + "' contains invalid " +
415                             "characters under NFKC normalization")
416
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult (scheme, netloc, path, query, fragment), or its
    bytes counterpart when the input was bytes.  The components are not
    broken up any further (e.g. netloc stays a single string) and
    % escapes are not expanded.

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that is
    rejected by _checknetloc().
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return _coerce_result(hit)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A leading "xxx:" is a scheme only if every character before the colon
    # is a valid scheme character; otherwise the caller's default is kept.
    colon = url.find(':')
    if colon > 0 and all(ch in scheme_chars for ch in url[:colon]):
        scheme, url = url[:colon].lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # exactly one of the two brackets present
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, _, fragment = url.partition('#')
    if '?' in url:
        url, _, query = url.partition('?')
    _checknetloc(netloc)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
453
def urlunparse(components):
    """Put a parsed URL back together again.

    components is a 6-item iterable (scheme, netloc, path, params, query,
    fragment).  The result may differ slightly from the URL that was
    originally parsed, while being equivalent, if that URL had redundant
    delimiters, e.g. a ? with an empty query (the draft states that these
    are equivalent).
    """
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    path = f"{url};{params}" if params else url
    return _coerce_result(urlunsplit((scheme, netloc, path, query, fragment)))
464
def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.

    components can be any five-item iterable (scheme, netloc, path, query,
    fragment).  The result may differ slightly from the URL that was
    originally parsed, while being equivalent, if that URL had unnecessary
    delimiters (for example, a ? with an empty query; the RFC states that
    these are equivalent).
    """
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    # Emit "//" when there is a netloc, or when the scheme normally takes
    # one and the path does not already begin with "//".
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = f'{scheme}:{url}'
    if query:
        url = f'{url}?{query}'
    if fragment:
        url = f'{url}#{fragment}'
    return _coerce_result(url)
483
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty base returns url unchanged and an empty url returns base
    unchanged.  url is also returned as-is when its scheme differs from the
    base's or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # url brings its own authority: nothing is taken from the base
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # No path of its own: reuse the base path/params, and the base
        # query too unless url supplies one.
        query = query or bquery
        return _coerce_result(urlunparse((scheme, netloc, bpath,
                                          bparams, query, fragment)))

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        base_parts = bpath.split('/')
        if base_parts[-1] != '':
            # the last item is not a directory, so will not be taken into
            # account in resolving the relative path
            base_parts.pop()
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on
        # re-joining the resolved_path
        segments[1:-1] = [part for part in segments[1:-1] if part]

    resolved_path = []
    for segment in segments:
        if segment == '.':
            continue
        if segment != '..':
            resolved_path.append(segment)
        elif resolved_path:
            # a '..' with nothing left to remove is ignored (rfc3986)
            resolved_path.pop()

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative
        # dir, then we need to append the trailing '/'
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
551
552
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a DefragResult (or its bytes counterpart for bytes input)
    holding the defragmented URL and the fragment.  If the URL contained no
    fragment, the second element is the empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        defrag, frag = url, ''
    else:
        parts = urlparse(url)
        frag = parts.fragment
        defrag = urlunparse(parts._replace(fragment=''))
    return _coerce_result(DefragResult(defrag, frag))
568
569_hexdig = '0123456789ABCDEFabcdef'
570_hextobyte = None
571
572def unquote_to_bytes(string):
573    """unquote_to_bytes('abc%20def') -> b'abc def'."""
574    # Note: strings are encoded as UTF-8. This is only an issue if it contains
575    # unescaped non-ASCII characters, which URIs should not.
576    if not string:
577        # Is it a string-like object?
578        string.split
579        return b''
580    if isinstance(string, str):
581        string = string.encode('utf-8')
582    bits = string.split(b'%')
583    if len(bits) == 1:
584        return string
585    res = [bits[0]]
586    append = res.append
587    # Delay the initialization of the table to not waste memory
588    # if the function is never called
589    global _hextobyte
590    if _hextobyte is None:
591        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
592                      for a in _hexdig for b in _hexdig}
593    for item in bits[1:]:
594        try:
595            append(_hextobyte[item[:2]])
596            append(item[2:])
597        except KeyError:
598            append(b'%')
599            append(item)
600    return b''.join(res)
601
# Splits a string into alternating non-ASCII / ASCII runs; the capturing
# group keeps the ASCII runs (the only ones that can hold escapes).
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent.

    string may be a str or a bytes object; the result is always a str.
    The optional encoding and errors parameters specify how to decode
    percent-encoded sequences into Unicode characters, as accepted by the
    bytes.decode() method.  By default, percent-encoded sequences are
    decoded with UTF-8, and invalid sequences are replaced by a placeholder
    character.  Passing None for either parameter selects its default.

    unquote('abc%20def') -> 'abc def'.
    """
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        # bytes input used to fail with TypeError on the "'%' not in string"
        # test below; unescape it and decode the whole result instead.
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # attribute access only: raises early for non-string-like objects
        string.split
        return string
    # Odd indexes hold ASCII runs, which may contain escapes; even indexes
    # hold non-ASCII text, which is copied through untouched.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
628
629
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a dictionary of value lists.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps them as blank strings; the default false
            value drops them as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when
            parse_qsl() reads more than that many fields.

        Returns a dictionary mapping each field name to the list of its
        values, in order of appearance.
    """
    grouped = {}
    fields = parse_qsl(qs, keep_blank_values, strict_parsing,
                       encoding=encoding, errors=errors,
                       max_num_fields=max_num_fields)
    for field_name, field_value in fields:
        grouped.setdefault(field_name, []).append(field_value)
    return grouped
667
668
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a list of (name, value) pairs.

        Fields are separated by either '&' or ';'.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps them as blank strings; the default false
            value drops them as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, a ValueError is raised when the query
            holds more than that many fields.

        Returns a list of (name, value) tuples.
    """
    qs, _coerce_result = _coerce_args(qs)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count('&') + qs.count(';')
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    parsed = []
    for ampersand_part in qs.split('&'):
        for field in ampersand_part.split(';'):
            if not field and not strict_parsing:
                continue
            raw_name, equals, raw_value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign:
                # it is kept, with a blank value, only on request.
                if not keep_blank_values:
                    continue
            if not (raw_value or keep_blank_values):
                continue
            name = unquote(raw_name.replace('+', ' '),
                           encoding=encoding, errors=errors)
            value = unquote(raw_value.replace('+', ' '),
                            encoding=encoding, errors=errors)
            parsed.append((_coerce_result(name), _coerce_result(value)))
    return parsed
728
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    The plus signs are replaced before the %xx escapes are expanded, so an
    escaped plus (%2B) stays a plus.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    return unquote(string.replace('+', ' '), encoding, errors)
737
738_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
739                         b'abcdefghijklmnopqrstuvwxyz'
740                         b'0123456789'
741                         b'_.-~')
742_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
743_safe_quoters = {}
744
745class Quoter(collections.defaultdict):
746    """A mapping from bytes (in range(0,256)) to strings.
747
748    String values are percent-encoded byte values, unless the key < 128, and
749    in the "safe" set (either the specified safe set, or default set).
750    """
751    # Keeps a cache internally, using defaultdict, for efficiency (lookups
752    # of cached keys don't call Python code at all).
753    def __init__(self, safe):
754        """safe: bytes object."""
755        self.safe = _ALWAYS_SAFE.union(safe)
756
757    def __repr__(self):
758        # Without this, will just display as a defaultdict
759        return "<%s %r>" % (self.__class__.__name__, dict(self))
760
761    def __missing__(self, b):
762        # Handle a cache miss. Store quoted string in cache and return.
763        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
764        self[b] = res
765        return res
766
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted. The
    quote function offers a cautious (not minimal) way to quote a
    string for most of these parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.

    Every character that is neither unreserved ("always safe") nor listed
    in the safe argument is %-escaped.  safe defaults to '/': the slash is
    reserved, but quote() is typically called on a path whose existing
    slashes are to be preserved.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object (TypeError otherwise).

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if not isinstance(string, str):
        # Already bytes-like: the codec arguments make no sense here.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)

    if not string:
        return string
    codec = 'utf-8' if encoding is None else encoding
    handler = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(codec, handler), safe)
820
def quote_plus(string, safe='', encoding=None, errors=None):
    """Quote like quote(), additionally turning each ' ' into '+'.

    This is the form needed when quoting HTML form values.  A plus sign
    already present in the input is percent-escaped unless '+' is listed
    in safe.  Unlike quote(), safe does not default to '/'.
    """
    # Work out whether the input holds a space, for str and bytes alike.
    # Without one, plain quote() already yields the final answer.  Any
    # other input type is sent down the general path below.
    if isinstance(string, str):
        needs_plus = ' ' in string
    elif isinstance(string, bytes):
        needs_plus = b' ' in string
    else:
        needs_plus = True
    if not needs_plus:
        return quote(string, safe, encoding, errors)

    # Declare the space safe so it survives quoting, then swap it for '+'.
    # The space literal has to match the type of safe to be concatenated.
    blank = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + blank, encoding, errors)
    return quoted.replace(' ', '+')
837
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'

    bs must be bytes or bytearray, otherwise TypeError is raised.  safe may
    be given as str or as an iterable of byte values; only its ASCII
    members are taken into account.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Reduce 'safe' to a bytes object holding only its ASCII members.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([octet for octet in safe if octet < 128])

    # Stripping every safe byte from the right leaves nothing exactly when
    # the whole input consists of safe bytes; then no escaping is needed.
    unescaped = _ALWAYS_SAFE_BYTES + safe
    if not bs.rstrip(unescaped):
        return bs.decode()

    # Reuse the quoting callable cached for this 'safe' value, creating
    # and remembering it on a cache miss.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
859
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.
    Components that are not bytes are converted with str() before quoting.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is not
    a bytes object).

    Returns the quoted 'key=value' pairs joined with '&'.

    Raises TypeError if query has no items() method and is not a sequence
    that is either empty or has a tuple as its first element.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            # Chain explicitly rather than keeping the traceback from
            # sys.exc_info() in a local, which would create a reference
            # cycle between this frame and the traceback.
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def quote_component(value):
        # bytes are quoted as they are; anything else goes through str()
        # and is quoted with the caller's encoding/errors settings.
        if isinstance(value, bytes):
            return quote_via(value, safe)
        return quote_via(str(value), safe, encoding, errors)

    pairs = []
    for k, v in query:
        k = quote_component(k)
        if doseq and not isinstance(v, (bytes, str)):
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence: fall through and encode it as one value
                pass
            else:
                # one 'key=element' parameter per element of the sequence
                pairs.extend(k + '=' + quote_component(elt) for elt in v)
                continue
        pairs.append(k + '=' + quote_component(v))
    return '&'.join(pairs)
939
940
def to_bytes(url):
    """Deprecated public alias of _to_bytes().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _to_bytes(url).
    """
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
945
946
def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is checked to be pure ASCII and handed back as a str;
    UnicodeError is raised when it is not.  Any other object is returned
    untouched.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        problem = "URL " + repr(url) + " contains non-ASCII characters"
        raise UnicodeError(problem)
959
960
def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  A pair of enclosing angle brackets and a leading 'URL:'
    prefix are then each removed if present, stripping whitespace again
    after each removal.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
972
973
def splittype(url):
    """Deprecated public alias of _splittype().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splittype(url).
    """
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittype(url)
979
980
_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type part is returned lower-cased.  When url does not begin with
    a 'type:' prefix (one or more characters other than '/' and ':'
    followed by ':'), the result is (None, url).
    """
    global _typeprog
    pattern = _typeprog
    if pattern is None:
        # Compile on first use and keep the result in the module global.
        pattern = _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = pattern.match(url)
    if found is None:
        return None, url
    scheme, rest = found.groups()
    return scheme.lower(), rest
993
994
def splithost(url):
    """Deprecated public alias of _splithost().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splithost(url).
    """
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splithost(url)
1000
1001
_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '#' or '?'.  A non-empty
    remainder that does not start with '/' gets one prepended.  When url
    does not start with '//', the result is (None, url).
    """
    global _hostprog
    pattern = _hostprog
    if pattern is None:
        # Compile on first use and keep the result in the module global.
        pattern = _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = pattern.match(url)
    if found is None:
        return None, url
    host_port, path = found.groups()
    if path and not path.startswith('/'):
        path = '/' + path
    return host_port, path
1016
1017
def splituser(host):
    """Deprecated public alias of _splituser().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splituser(host).
    """
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splituser(host)
1023
1024
def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part is
    None and the whole input is returned as the host part.
    """
    head, at_sign, tail = host.rpartition('@')
    if not at_sign:
        return None, tail
    return head, tail
1029
1030
def splitpasswd(user):
    """Deprecated public alias of _splitpasswd().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splitpasswd(user).
    """
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1036
1037
def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    part is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        secret = None
    return name, secret
1042
1043
def splitport(host):
    """Deprecated public alias of _splitport().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splitport(host).
    """
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitport(host)
1049
1050
# Compiled lazily by _splitport() on its first call.
_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  If the text after
    the last ':' is not made up solely of the digits 0-9, the input is
    returned unchanged with None as the port.  If the input ends in a
    bare ':' (empty port), the ':' is dropped from the host and the port
    is None.
    """
    global _portprog
    if _portprog is None:
        # No '$' anchor here: '$' also matches just before a trailing
        # newline, which would let 'host:80\n' through with the newline
        # silently discarded.  fullmatch() below anchors at the true end.
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
1065
1066
def splitnport(host, defport=-1):
    """Deprecated public alias of _splitnport().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splitnport(host, defport).
    """
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1072
1073
def _splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    The default is also returned when nothing follows the last ':'.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    Only a run of ASCII digits counts as a valid number; signs,
    whitespace, underscores and non-ASCII digits are rejected even though
    int() would accept them.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() put the whole input in the last slot.
        host = port
    elif port:
        # Validate before converting: int() alone would also parse '+80',
        # '-1' (which collides with the default sentinel), ' 80' and '1_0'.
        if port.isdigit() and port.isascii():
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
1089
1090
def splitquery(url):
    """Deprecated public alias of _splitquery().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splitquery(url).
    """
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1096
1097
def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    head, question_mark, query = url.rpartition('?')
    return (head, query) if question_mark else (url, None)
1104
1105
def splittag(url):
    """Deprecated public alias of _splittag().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splittag(url).
    """
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splittag(url)
1111
1112
def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    head, hash_mark, tag = url.rpartition('#')
    return (head, tag) if hash_mark else (url, None)
1119
1120
def splitattr(url):
    """Deprecated public alias of _splitattr().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splitattr(url).
    """
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1126
1127
def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The input is cut at every ';'.  The first piece is the path and the
    remaining pieces, possibly none, form the attribute list.
    """
    path, *attrs = url.split(';')
    return path, attrs
1133
1134
def splitvalue(attr):
    """Deprecated public alias of _splitvalue().

    Issues a DeprecationWarning attributed to the caller (stacklevel=2)
    and then returns the result of _splitvalue(attr).
    """
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1140
1141
def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value part
    is None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        value = None
    return name, value
1146