• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9and L.Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any future changes to
23urlparse module should conform with it.  The urlparse module is
24currently not entirely compliant with this RFC due to defacto
25scenarios for parsing, and for backward compatibility purposes, some
26parsing quirks from older RFCs are retained. The testcases in
27test_urlparse.py provides a good indicator of parsing behavior.
28"""
29
30import re
31import sys
32import collections
33
# Public names exported by ``from <module> import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a reference against the base URL;
# for any other scheme urljoin() returns the reference unchanged.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes consulted by urlunsplit() (to decide whether to emit '//') and by
# urljoin() (to decide whether the base netloc is inherited).
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names; urlsplit() only accepts the text before
# the first ':' as a scheme when every character is in this set.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache and wipes the caches (via
# clear_cache()) once MAX_CACHE_SIZE entries have accumulated.
MAX_CACHE_SIZE = 20
_parse_cache = {}
82
def clear_cache():
    """Empty both module-level caches: parsed URLs and cached quoters."""
    for cache in (_parse_cache, _safe_quoters):
        cache.clear()
87
88
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# Default codec and error handler used by _encode_result() and
# _decode_args() when bytes arguments are converted to and from str.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
97
def _noop(obj):
    """Return obj unchanged; the result coercion used for str inputs."""
    return obj
100
def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode a result via obj.encode(); the coercion used for non-str inputs.

    obj is either a str or a result object providing its own encode().
    """
    return obj.encode(encoding, errors)
104
def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every item of args to str and return them as a tuple.

    Falsy items (empty bytes, '', None) are replaced by the empty str
    instead of being decoded.
    """
    decoded = []
    for value in args:
        decoded.append(value.decode(encoding, errors) if value else '')
    return tuple(decoded)
108
def _coerce_args(*args):
    """Normalise arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item:
    _noop when the first argument is a str, otherwise _encode_result so
    that callers can convert their result back to bytes.

    Raises TypeError when a non-empty argument disagrees with the first
    argument about being str.
    """
    first_is_str = isinstance(args[0], str)
    # Falsy values are exempt so that a "scheme=''" default argument can be
    # combined with bytes input.
    mixed = any(arg and isinstance(arg, str) != first_is_str
                for arg in args[1:])
    if mixed:
        raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
124
125# Result objects are more helpful than simple tuples
126class _ResultMixinStr(object):
127    """Standard approach to encoding parsed results from str to bytes"""
128    __slots__ = ()
129
130    def encode(self, encoding='ascii', errors='strict'):
131        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
132
133
134class _ResultMixinBytes(object):
135    """Standard approach to decoding parsed results from bytes to str"""
136    __slots__ = ()
137
138    def decode(self, encoding='ascii', errors='strict'):
139        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
140
141
142class _NetlocResultMixinBase(object):
143    """Shared methods for the parsed result objects containing a netloc element"""
144    __slots__ = ()
145
146    @property
147    def username(self):
148        return self._userinfo[0]
149
150    @property
151    def password(self):
152        return self._userinfo[1]
153
154    @property
155    def hostname(self):
156        hostname = self._hostinfo[0]
157        if not hostname:
158            return None
159        # Scoped IPv6 address may have zone info, which must not be lowercased
160        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
161        separator = '%' if isinstance(hostname, str) else b'%'
162        hostname, percent, zone = hostname.partition(separator)
163        return hostname.lower() + percent + zone
164
165    @property
166    def port(self):
167        port = self._hostinfo[1]
168        if port is not None:
169            port = int(port, 10)
170            if not ( 0 <= port <= 65535):
171                raise ValueError("Port out of range 0-65535")
172        return port
173
174
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    # str implementation of the netloc splitting used by the base mixin.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last '@'.

        Both are None without an '@'; password is None without a ':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last '@'.

        A bracketed host keeps the text between '[' and ']'.  The port is
        returned as text, or None when absent or empty.
        """
        hostinfo = self.netloc.rpartition('@')[2]
        _, open_bracket, bracketed = hostinfo.partition('[')
        if open_bracket:
            hostname, _, after_bracket = bracketed.partition(']')
            port = after_bracket.partition(':')[2]
        else:
            hostname, _, port = hostinfo.partition(':')
        return hostname, (port or None)
203
204
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    # bytes implementation of the netloc splitting used by the base mixin.
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) taken from before the last b'@'.

        Both are None without a b'@'; password is None without a b':'.
        """
        userinfo, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        username, colon, password = userinfo.partition(b':')
        return username, (password if colon else None)

    @property
    def _hostinfo(self):
        """Return (hostname, port) taken from after the last b'@'.

        A bracketed host keeps the bytes between b'[' and b']'.  The port
        is returned as bytes, or None when absent or empty.
        """
        hostinfo = self.netloc.rpartition(b'@')[2]
        _, open_bracket, bracketed = hostinfo.partition(b'[')
        if open_bracket:
            hostname, _, after_bracket = bracketed.partition(b']')
            port = after_bracket.partition(b':')[2]
        else:
            hostname, _, port = hostinfo.partition(b':')
        return hostname, (port or None)
233
234
from collections import namedtuple

# Plain namedtuple bases; the public result classes below combine them with
# the str/bytes mixins.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# Docstrings are attached after creation, both to the tuple classes and to
# their generated field descriptors.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params,  query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
308
309# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    # str flavour of the (url, fragment) pair.
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when one is present."""
        return self.url + '#' + self.fragment if self.fragment else self.url
317
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    # str flavour of the 5-field split result.
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt from this result by urlunsplit()."""
        return urlunsplit(self)
322
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    # str flavour of the 6-field parse result.
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt from this result by urlunparse()."""
        return urlunparse(self)
327
328# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    # bytes flavour of the (url, fragment) pair.
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when one is present."""
        return self.url + b'#' + self.fragment if self.fragment else self.url
336
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    # bytes flavour of the 5-field split result.
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt from this result by urlunsplit()."""
        return urlunsplit(self)
341
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    # bytes flavour of the 6-field parse result.
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt from this result by urlunparse()."""
        return urlunparse(self)
346
347# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Cross-link each str result class with its bytes counterpart.

    The links are what the encode()/decode() mixin methods instantiate.
    """
    for str_cls, bytes_cls in (
            (DefragResult, DefragResultBytes),
            (SplitResult, SplitResultBytes),
            (ParseResult, ParseResultBytes)):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
360
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a ParseResult (scheme, netloc, path, params, query, fragment),
    converted back to its bytes form when the input was not str.
    Components are not broken up further (netloc stays one string) and
    % escapes are not expanded.  params is only split off the path for
    schemes listed in uses_params.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return _coerce_result(
        ParseResult(scheme, netloc, path, params, query, fragment))
376
377def _splitparams(url):
378    if '/'  in url:
379        i = url.find(';', url.rfind('/'))
380        if i < 0:
381            return url, ''
382    else:
383        i = url.find(';')
384    return url[:i], url[i+1:]
385
386def _splitnetloc(url, start=0):
387    delim = len(url)   # position of end of domain part of url, default is end
388    for c in '/?#':    # look for delimiters; the order is NOT important
389        wdelim = url.find(c, start)        # find first of this delim
390        if wdelim >= 0:                    # if found
391            delim = min(delim, wdelim)     # use earliest delim position
392    return url[start:delim], url[delim:]   # return (domain, rest)
393
394def _checknetloc(netloc):
395    if not netloc or netloc.isascii():
396        return
397    # looking for characters like \u2100 that expand to 'a/c'
398    # IDNA uses NFKC equivalence, so normalize for this check
399    import unicodedata
400    netloc2 = unicodedata.normalize('NFKC', netloc)
401    if netloc == netloc2:
402        return
403    _, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
404    for c in '/?#@:':
405        if c in netloc2:
406            raise ValueError("netloc '" + netloc2 + "' contains invalid " +
407                             "characters under NFKC normalization")
408
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Returns a SplitResult (scheme, netloc, path, query, fragment),
    converted back to its bytes form when the input was not str.
    Components are not broken up further (netloc stays one string) and
    % escapes are not expanded.  Results are memoized in _parse_cache.

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that
    _checknetloc() rejects.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()

    colon = url.find(':')
    if colon > 0:
        prefix, rest = url[:colon], url[colon+1:]
        if prefix == 'http':
            # Common case: accepted as-is, without the port-number check
            # applied to other schemes below.
            scheme, url = 'http', rest
        elif all(c in scheme_chars for c in prefix):
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            if not rest or any(c not in '0123456789' for c in rest):
                scheme, url = prefix.lower(), rest

    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        has_open, has_close = '[' in netloc, ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return _coerce_result(result)
465
def urlunparse(components):
    """Reassemble a URL from the six items produced by urlparse().

    The result may differ slightly from the originally parsed URL while
    being equivalent, e.g. a '?' followed by an empty query is dropped.
    """
    (scheme, netloc, path, params, query, fragment,
     _coerce_result) = _coerce_args(*components)
    if params:
        path = "%s;%s" % (path, params)
    rebuilt = urlunsplit((scheme, netloc, path, query, fragment))
    return _coerce_result(rebuilt)
476
def urlunsplit(components):
    """Reassemble a URL from the five items produced by urlsplit().

    components may be any five-item iterable.  The result may differ
    slightly from the originally parsed URL while being equivalent, e.g. a
    '?' followed by an empty query is dropped.
    """
    (scheme, netloc, url, query, fragment,
     _coerce_result) = _coerce_args(*components)
    # '//' is emitted for an explicit netloc, and also for netloc-style
    # schemes whose path does not already start with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    for marker, part in (('?', query), ('#', fragment)):
        if part:
            url = url + marker + part
    return _coerce_result(url)
495
def urljoin(base, url, allow_fragments=True):
    """Resolve url, which may be relative, against base.

    Returns url unchanged when base is empty, base unchanged when url is
    empty, and url itself when its scheme differs from the base scheme or
    is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing to inherit.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not (path or params):
        # Only query and/or fragment given: keep the base path and params,
        # and the base query too unless the reference supplies one.
        return _coerce_result(urlunparse((scheme, netloc, bpath, bparams,
                                          query or bquery, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1]:
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_parts.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = [part for part in segments[1:-1] if part]

    resolved_path = []
    for seg in segments:
        if seg == '.':
            continue
        if seg != '..':
            resolved_path.append(seg)
        elif resolved_path:
            # a '..' with nothing left to remove is ignored (rfc3986)
            resolved_path.pop()

    if segments[-1] in ('.', '..'):
        # the reference ended in a relative directory, so the result needs
        # a trailing '/'
        resolved_path.append('')

    joined = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined,
                                      params, query, fragment)))
563
564
def urldefrag(url):
    """Split the fragment off url.

    Returns a DefragResult (url without fragment, fragment); the fragment
    is the empty string when url contains no '#'.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    s, n, p, a, q, frag = urlparse(url)
    without_fragment = urlunparse((s, n, p, a, q, ''))
    return _coerce_result(DefragResult(without_fragment, frag))
580
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'.

    Accepts str or bytes.  A str is encoded as UTF-8 first, which only
    matters when it holds unescaped non-ASCII characters.  A '%' that is
    not followed by two hex digits is kept literally.
    """
    global _hextobyte
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        return string
    # The lookup table is built on first use so that it costs no memory
    # when the function is never called.
    if _hextobyte is None:
        _hextobyte = {(hi + lo).encode(): bytes.fromhex(hi + lo)
                      for hi in _hexdig for lo in _hexdig}
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # not a valid escape: put the '%' back untouched
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(byte)
            pieces.append(chunk[2:])
    return b''.join(pieces)
613
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes in string by the characters they encode.

    encoding and errors say how the percent-encoded byte sequences are
    decoded, exactly as for bytes.decode(); None selects the defaults
    (UTF-8, with invalid sequences replaced by a placeholder character).

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        string.split
        return string
    encoding = 'utf-8' if encoding is None else encoding
    errors = 'replace' if errors is None else errors
    # Splitting on a capturing group alternates: non-ASCII text, ASCII run,
    # non-ASCII text, ...  Only the ASCII runs can hold escapes.
    pieces = _asciire.split(string)
    out = [pieces[0]]
    for ascii_run, passthrough in zip(pieces[1::2], pieces[2::2]):
        out.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        out.append(passthrough)
    return ''.join(out)
640
641
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a dictionary of value lists.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: when true, fields with a blank value are kept
            with an empty string as value; when false (the default) they
            are dropped as if they had not been sent.

        strict_parsing: when true, parsing errors raise ValueError; when
            false (the default) they are silently ignored.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        max_num_fields: int. If set, ValueError is raised when
            parse_qsl() reads more than that many fields.

        Returns a dictionary mapping each name to the list of its values,
        in order of appearance.
    """
    parsed_result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
                                 encoding=encoding, errors=errors,
                                 max_num_fields=max_num_fields):
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
679
680
def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query string into a list of (name, value) pairs.

        Arguments:

        qs: percent-encoded query string to be parsed; fields are
            separated by '&' or ';'

        keep_blank_values: when true, fields with a blank value are kept
            with an empty string as value; when false (the default) they
            are dropped as if they had not been sent.

        strict_parsing: when true, parsing errors raise ValueError; when
            false (the default) they are silently ignored.

        encoding and errors: how percent-encoded sequences are decoded
            into Unicode characters, as accepted by bytes.decode().

        max_num_fields: int. If set, ValueError is raised when the query
            holds more than that many fields.

        Returns a list of (name, value) tuples in order of appearance.
    """
    qs, _coerce_result = _coerce_args(qs)

    # Count the fields before splitting anything so that a request with a
    # huge number of fields is refused up front (memory exhaustion DOS via
    # post bodies with many fields).
    if max_num_fields is not None:
        num_fields = 1 + qs.count('&') + qs.count(';')
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    def _decode(token):
        # '+' means space in form data; then undo the percent-escapes.
        text = unquote(token.replace('+', ' '),
                       encoding=encoding, errors=errors)
        return _coerce_result(text)

    r = []
    for ampersand_part in qs.split('&'):
        for field in ampersand_part.split(';'):
            if not field and not strict_parsing:
                continue
            name, equals, value = field.partition('=')
            if not equals:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # a control-name with no equal sign counts as blank value
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                r.append((_decode(name), _decode(value)))
    return r
740
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are turned into spaces first, as
    needed for unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    with_spaces = string.replace('+', ' ')
    return unquote(with_spaces, encoding, errors)
749
750_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
751                         b'abcdefghijklmnopqrstuvwxyz'
752                         b'0123456789'
753                         b'_.-~')
754_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
755_safe_quoters = {}
756
757class Quoter(collections.defaultdict):
758    """A mapping from bytes (in range(0,256)) to strings.
759
760    String values are percent-encoded byte values, unless the key < 128, and
761    in the "safe" set (either the specified safe set, or default set).
762    """
763    # Keeps a cache internally, using defaultdict, for efficiency (lookups
764    # of cached keys don't call Python code at all).
765    def __init__(self, safe):
766        """safe: bytes object."""
767        self.safe = _ALWAYS_SAFE.union(safe)
768
769    def __repr__(self):
770        # Without this, will just display as a defaultdict
771        return "<%s %r>" % (self.__class__.__name__, dict(self))
772
773    def __missing__(self, b):
774        # Handle a cache miss. Store quoted string in cache and return.
775        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
776        self[b] = res
777        return res
778
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Percent-encode string.  Each part of a URL (path, query, ...) has its
    own set of reserved characters, so the caller chooses through safe
    which additional characters are left alone.  The default, '/', suits
    the path section of a URL, where existing slashes act as separators.
    The bytes in _ALWAYS_SAFE (ASCII letters, digits and '_.-~') are meant
    to stay unquoted regardless of safe.

    string and safe may be either str or bytes objects.

    encoding and errors say how a str is turned into bytes, as accepted
    by str.encode(); they default to 'utf-8' and 'strict' (unsupported
    characters raise UnicodeEncodeError).

    Raises TypeError when encoding or errors is given for a non-str string.
    """
    if not isinstance(string, str):
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'strict'
    return quote_from_bytes(string.encode(encoding, errors), safe)
825
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.

    string may be a str or a bytes object; safe may be a str or a bytes
    object as well.  The remaining arguments are forwarded to quote().
    """
    # Decide whether the input can contain a space at all.  Inputs that
    # are neither str nor bytes always take the general path below.
    if isinstance(string, str):
        may_have_space = ' ' in string
    elif isinstance(string, bytes):
        may_have_space = b' ' in string
    else:
        may_have_space = True

    # Without spaces the regular quote() already gives the right answer.
    if not may_have_space:
        return quote(string, safe, encoding, errors)

    # Temporarily treat the space as safe (in the same type as 'safe') so
    # that it survives quoting and can then be turned into '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
842
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    bs must be a bytes or bytearray object, otherwise TypeError is raised.
    safe may be a str or a bytes-like iterable of ints; non-ASCII entries
    in it are dropped.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''

    # Normalize 'safe' to a bytes object holding only ASCII values, so it
    # can be used both for stripping and as a cache key.
    if not isinstance(safe, str):
        safe = bytes(c for c in safe if c < 128)
    else:
        safe = safe.encode('ascii', 'ignore')

    # Fast path: if stripping every safe byte leaves nothing, no byte
    # needs quoting and the input can be decoded as it is.
    never_quoted = _ALWAYS_SAFE_BYTES + safe
    if not bs.rstrip(never_quoted):
        return bs.decode()

    # Reuse the per-'safe' quoting callable when one was built before.
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        quoter = Quoter(safe).__getitem__
        _safe_quoters[safe] = quoter
    return ''.join(map(quoter, bs))
864
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Returns the 'key=value' pairs joined with '&'.  Raises TypeError if
    query is neither a mapping nor a (possibly empty) sequence whose
    first element is a tuple.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    def _quote_component(part):
        # bytes are quoted as they are; anything else is converted with
        # str() first and quoted using the requested encoding/errors.
        if isinstance(part, bytes):
            return quote_via(part, safe)
        return quote_via(str(part), safe, encoding, errors)

    pairs = []
    for k, v in query:
        k = _quote_component(k)
        if not doseq or isinstance(v, bytes):
            values = [_quote_component(v)]
        elif isinstance(v, str):
            # A str is a sequence too, but must stay a single value.
            values = [quote_via(v, safe, encoding, errors)]
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                values = [_quote_component(v)]
            else:
                # one parameter per element of the sequence
                values = [_quote_component(elt) for elt in v]
        pairs.extend(k + '=' + value for value in values)
    return '&'.join(pairs)
944
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'.

    A str argument is checked to be pure ASCII and returned as a str;
    UnicodeError is raised when it is not.  Any other object is returned
    unchanged.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if not isinstance(url, str):
        return url
    try:
        return url.encode("ASCII").decode()
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
957
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace; an enclosing '<...>' pair and a leading 'URL:' prefix
    are then removed, each followed by another strip.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
965
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The type is returned in lower case.  When url does not start with
    'type:', the result is (None, url).
    """
    global _typeprog
    # The pattern is compiled on first use and kept in the module global.
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    found = _typeprog.match(url)
    if found is None:
        return None, url
    return found.group(1).lower(), found.group(2)
978
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    When url does not start with '//', the result is (None, url).  A
    non-empty remainder that does not begin with '/' (for instance one
    starting at '?' or '#') gets a '/' prepended.
    """
    global _hostprog
    # The pattern is compiled on first use and kept in the module global.
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    found = _hostprog.match(url)
    if not found:
        return None, url
    authority = found.group(1)
    remainder = found.group(2)
    if remainder and not remainder.startswith('/'):
        remainder = '/' + remainder
    return authority, remainder
993
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Without any '@' the user part
    is None.
    """
    userinfo, at_sign, hostport = host.rpartition('@')
    if at_sign:
        return userinfo, hostport
    return None, hostport
998
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without any ':' the password
    is None.
    """
    name, colon, secret = user.partition(':')
    if not colon:
        return name, None
    return name, secret
1003
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of ASCII digits.  When host does
    not end in ':' followed by digits the port is None; a bare trailing
    ':' is dropped from the returned host.  The whole string has to
    match, so 'host:80\\n' is not taken for a host with port '80'.
    """
    global _portprog
    # The pattern is compiled on first use and kept in the module global.
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    # fullmatch() instead of match() with '$': '$' would also match just
    # before a trailing newline and silently discard that newline.
    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None
1018
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    The split happens at the last ':'.
    Return given default port if no ':' is found or nothing follows it;
    defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' is followed by something that is not a valid
    number.  Only a run of ASCII digits counts as valid; signs, spaces,
    underscores and non-ASCII digits are rejected.
    """
    host, delim, port = host.rpartition(':')
    if not delim:
        # No ':' at all: rpartition() left the whole input in 'port'.
        host = port
    elif port:
        # int() alone is too lenient (' 80', '-1', '8_0' all parse), so
        # check the characters before converting.
        if port.isascii() and port.isdigit():
            nport = int(port)
        else:
            nport = None
        return host, nport
    return host, defport
1034
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without any '?' the result is
    (url, None).
    """
    before, question_mark, after = url.rpartition('?')
    return (before, after) if question_mark else (url, None)
1041
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without any '#' the result is
    (url, None).
    """
    before, hash_sign, fragment = url.rpartition('#')
    if not hash_sign:
        return url, None
    return before, fragment
1048
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when url contains no ';'.
    """
    path, *attributes = url.split(';')
    return path, attributes
1054
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without any '=' the value is
    None.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        return name, None
    return name, value
1059