"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""

import re
import sys
import types
import collections
import warnings

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    """Return obj unchanged (result coercion used for str inputs)."""
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode a result object via its own encode() method."""
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Decode every truthy item of args; falsy items become ''."""
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    """Return args as str values plus a function to coerce the result back.

    Raises TypeError if str and non-str arguments are mixed (falsy
    arguments after the first are exempt from this check).
    """
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #   - noop for str inputs
    #   - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart with every field encoded."""
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart with every field decoded."""
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        """User name part of the netloc, or None if there is no userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password part of the netloc, or None if absent."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None if the netloc has no host."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port number as an int, or None if the netloc has no port.

        Raises ValueError if the port is not made up solely of ASCII
        digits or is outside the range 0-65535.
        """
        port = self._hostinfo[1]
        if port is not None:
            # int() alone is too lenient: it accepts a sign, surrounding
            # whitespace, underscores and non-ASCII digits, none of which
            # are valid in a URL port.
            if port.isdigit() and port.isascii():
                port = int(port)
            else:
                raise ValueError(
                    f'Port could not be cast to integer value as {port!r}')
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) split out of the str netloc."""
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) split out of the str netloc.

        The port is the raw text after the ':' (None when empty or absent).
        """
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            # Bracketed (IPv6 literal) host: the port follows the ']'
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password) split out of the bytes netloc."""
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) split out of the bytes netloc.

        The port is the raw bytes after the b':' (None when empty or absent).
        """
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            # Bracketed (IPv6 literal) host: the port follows the b']'
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port


from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Return the URL with the fragment (if any) re-attached."""
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt with urlunsplit()."""
        return urlunsplit(self)

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt with urlunparse()."""
        return urlunparse(self)

# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Return the URL with the fragment (if any) re-attached."""
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt with urlunsplit()."""
        return urlunsplit(self)

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Return the URL rebuilt with urlunparse()."""
        return urlunparse(self)

# Set up the encode/decode result pairs
def _fix_result_transcoding():
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

_fix_result_transcoding()
del _fix_result_transcoding

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a named 6-tuple with fields corresponding to the
    above. It is either a ParseResult or ParseResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

def _splitparams(url):
    """Split the params off the last path segment; return (path, params)."""
    if '/' in url:
        # Only a ';' in the last path segment introduces params
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    """Return (netloc, rest), where netloc runs from start to the first of '/?#'."""
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def _checknetloc(netloc):
    """Raise ValueError if NFKC normalization introduces a delimiter into netloc."""
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    n = netloc.replace('@', '')   # ignore characters already included
    n = n.replace(':', '')        # but not the surrounding text
    n = n.replace('#', '')
    n = n.replace('?', '')
    netloc2 = unicodedata.normalize('NFKC', n)
    if n == netloc2:
        return
    for c in '/?#@:':
        if c in netloc2:
            raise ValueError("netloc '" + netloc + "' contains invalid " +
                             "characters under NFKC normalization")

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    ASCII tab, carriage return and newline characters are removed from
    both url and scheme before parsing.

    Raises ValueError for an unbalanced '[' or ']' in the netloc, or for
    a netloc that contains delimiters after NFKC normalization.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)

    # Strip the unsafe characters *before* looking for the scheme, so that
    # they cannot hide a scheme (e.g. "ht\ntp://...") from the parser.
    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
        scheme = scheme.replace(b, "")

    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0 and url[0].isascii() and url[0].isalpha():
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    if params:
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))


def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))

_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid %xx escape: keep the text as it was
            append(b'%')
            append(item)
    return b''.join(res)

_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute access only: fails early for non string-like objects
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # Odd-indexed items are the ASCII runs (which may hold escapes); the
    # even-indexed items between them are non-ASCII text passed through.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)


def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a dictionary.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result


def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were  not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError
            if there are more than n fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)
    separator, _ = _coerce_args(separator)

    if not separator or (not isinstance(separator, (str, bytes))):
        raise ValueError("Separator must be of type string or bytes.")

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    r = []
    for name_value in qs.split(separator):
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted. The
    quote function offers a cautious (not minimal) way to quote a
    string for most of these parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.

    The quote function %-escapes all characters that are neither in the
    unreserved chars ("always safe") nor the additional chars set via the
    safe arg.

    The default for the safe arg is '/'. The character is reserved, but in
    typical usage the quote function is being called on a path where the
    existing slash characters are to be preserved.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    # Keep spaces unescaped in quote() so they can be turned into '+' below
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: nothing is left after stripping safe bytes, so no quoting
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a sequence of
    two-element tuples.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object") from err

    l = []
    if not doseq:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
            else:
                v = quote_via(str(v), safe, encoding, errors)
            l.append(k + '=' + v)
    else:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
                l.append(k + '=' + v)
            elif isinstance(v, str):
                v = quote_via(v, safe, encoding, errors)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = quote_via(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        if isinstance(elt, bytes):
                            elt = quote_via(elt, safe)
                        else:
                            elt = quote_via(str(elt), safe, encoding, errors)
                        l.append(k + '=' + elt)
    return '&'.join(l)


def to_bytes(url):
    """Deprecated public wrapper around _to_bytes(); warns when called."""
    warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
                  DeprecationWarning, stacklevel=2)
    return _to_bytes(url)


def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
1000 # XXX get rid of to_bytes() 1001 if isinstance(url, str): 1002 try: 1003 url = url.encode("ASCII").decode() 1004 except UnicodeError: 1005 raise UnicodeError("URL " + repr(url) + 1006 " contains non-ASCII characters") 1007 return url 1008 1009 1010def unwrap(url): 1011 """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'. 1012 1013 The string is returned unchanged if it's not a wrapped URL. 1014 """ 1015 url = str(url).strip() 1016 if url[:1] == '<' and url[-1:] == '>': 1017 url = url[1:-1].strip() 1018 if url[:4] == 'URL:': 1019 url = url[4:].strip() 1020 return url 1021 1022 1023def splittype(url): 1024 warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, " 1025 "use urllib.parse.urlparse() instead", 1026 DeprecationWarning, stacklevel=2) 1027 return _splittype(url) 1028 1029 1030_typeprog = None 1031def _splittype(url): 1032 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 1033 global _typeprog 1034 if _typeprog is None: 1035 _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) 1036 1037 match = _typeprog.match(url) 1038 if match: 1039 scheme, data = match.groups() 1040 return scheme.lower(), data 1041 return None, url 1042 1043 1044def splithost(url): 1045 warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, " 1046 "use urllib.parse.urlparse() instead", 1047 DeprecationWarning, stacklevel=2) 1048 return _splithost(url) 1049 1050 1051_hostprog = None 1052def _splithost(url): 1053 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 1054 global _hostprog 1055 if _hostprog is None: 1056 _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) 1057 1058 match = _hostprog.match(url) 1059 if match: 1060 host_port, path = match.groups() 1061 if path and path[0] != '/': 1062 path = '/' + path 1063 return host_port, path 1064 return None, url 1065 1066 1067def splituser(host): 1068 warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, " 1069 "use urllib.parse.urlparse() instead", 
1070 DeprecationWarning, stacklevel=2) 1071 return _splituser(host) 1072 1073 1074def _splituser(host): 1075 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 1076 user, delim, host = host.rpartition('@') 1077 return (user if delim else None), host 1078 1079 1080def splitpasswd(user): 1081 warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, " 1082 "use urllib.parse.urlparse() instead", 1083 DeprecationWarning, stacklevel=2) 1084 return _splitpasswd(user) 1085 1086 1087def _splitpasswd(user): 1088 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 1089 user, delim, passwd = user.partition(':') 1090 return user, (passwd if delim else None) 1091 1092 1093def splitport(host): 1094 warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, " 1095 "use urllib.parse.urlparse() instead", 1096 DeprecationWarning, stacklevel=2) 1097 return _splitport(host) 1098 1099 1100# splittag('/path#tag') --> '/path', 'tag' 1101_portprog = None 1102def _splitport(host): 1103 """splitport('host:port') --> 'host', 'port'.""" 1104 global _portprog 1105 if _portprog is None: 1106 _portprog = re.compile('(.*):([0-9]*)', re.DOTALL) 1107 1108 match = _portprog.fullmatch(host) 1109 if match: 1110 host, port = match.groups() 1111 if port: 1112 return host, port 1113 return host, None 1114 1115 1116def splitnport(host, defport=-1): 1117 warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, " 1118 "use urllib.parse.urlparse() instead", 1119 DeprecationWarning, stacklevel=2) 1120 return _splitnport(host, defport) 1121 1122 1123def _splitnport(host, defport=-1): 1124 """Split host and port, returning numeric port. 1125 Return given default port if no ':' found; defaults to -1. 1126 Return numerical port if a valid number are found after ':'. 
1127 Return None if ':' but not a valid number.""" 1128 host, delim, port = host.rpartition(':') 1129 if not delim: 1130 host = port 1131 elif port: 1132 try: 1133 nport = int(port) 1134 except ValueError: 1135 nport = None 1136 return host, nport 1137 return host, defport 1138 1139 1140def splitquery(url): 1141 warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, " 1142 "use urllib.parse.urlparse() instead", 1143 DeprecationWarning, stacklevel=2) 1144 return _splitquery(url) 1145 1146 1147def _splitquery(url): 1148 """splitquery('/path?query') --> '/path', 'query'.""" 1149 path, delim, query = url.rpartition('?') 1150 if delim: 1151 return path, query 1152 return url, None 1153 1154 1155def splittag(url): 1156 warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, " 1157 "use urllib.parse.urlparse() instead", 1158 DeprecationWarning, stacklevel=2) 1159 return _splittag(url) 1160 1161 1162def _splittag(url): 1163 """splittag('/path#tag') --> '/path', 'tag'.""" 1164 path, delim, tag = url.rpartition('#') 1165 if delim: 1166 return path, tag 1167 return url, None 1168 1169 1170def splitattr(url): 1171 warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, " 1172 "use urllib.parse.urlparse() instead", 1173 DeprecationWarning, stacklevel=2) 1174 return _splitattr(url) 1175 1176 1177def _splitattr(url): 1178 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1179 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1180 words = url.split(';') 1181 return words[0], words[1:] 1182 1183 1184def splitvalue(attr): 1185 warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, " 1186 "use urllib.parse.parse_qsl() instead", 1187 DeprecationWarning, stacklevel=2) 1188 return _splitvalue(attr) 1189 1190 1191def _splitvalue(attr): 1192 """splitvalue('attr=value') --> 'attr', 'value'.""" 1193 attr, delim, value = attr.partition('=') 1194 return attr, (value if delim else None) 1195