"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""

import re
import sys
import types
import collections
import warnings

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Leading C0 control characters and space to be stripped per WHATWG spec
# (every code point from U+0000 to U+0020 inclusive).
_WHATWG_C0_CONTROL_OR_SPACE = ''.join(map(chr, range(0x21)))

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'


def _noop(obj):
    """Return obj unchanged (result coercion used for str inputs)."""
    return obj


def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode a str-based result back to bytes (coercion for bytes inputs)."""
    return obj.encode(encoding, errors)


def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every truthy item of args; falsy items become ''."""
    return tuple(x.decode(encoding, errors) if x else '' for x in args)


def _coerce_args(*args):
    """Return args as str values followed by a result-coercion function.

    The first argument decides whether the call is str-based or not.
    Raises TypeError if a later, non-empty argument disagrees with it.
    """
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #   - noop for str inputs
    #   - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)


# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Return the bytes counterpart with every field encoded."""
        return self._encoded_counterpart(
            *(x.encode(encoding, errors) for x in self))


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Return the str counterpart with every field decoded."""
        return self._decoded_counterpart(
            *(x.decode(encoding, errors) for x in self))


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        """User name from the netloc, or None when there is no userinfo."""
        return self._userinfo[0]

    @property
    def password(self):
        """Password from the netloc, or None when none was given."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the netloc has no host."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port number as an int, or None when the netloc has no port.

        Raises ValueError if the port is not made up solely of ASCII digits
        or lies outside the range 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone would also accept a sign, surrounding whitespace,
        # underscores and non-ASCII digits, so validate the text first.
        if not (port.isascii() and port.isdigit()):
            raise ValueError(
                f'Port could not be cast to integer value as {port!r}')
        port = int(port, 10)
        if not 0 <= port <= 65535:
            raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """Netloc splitting for str results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password); either may be None."""
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) as text; port is None when absent."""
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            # Bracketed (IPv6 literal) host: the port follows the ']'.
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """Netloc splitting for bytes results."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return (username, password); either may be None."""
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        """Return (hostname, port) as bytes; port is None when absent."""
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            # Bracketed (IPv6 literal) host: the port follows the ']'.
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port


from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr


# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    """str result of urldefrag()."""
    __slots__ = ()

    def geturl(self):
        """Re-attach the fragment (if any) to the URL."""
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url


class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    """str result of urlsplit()."""
    __slots__ = ()

    def geturl(self):
        """Recombine the components with urlunsplit()."""
        return urlunsplit(self)


class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    """str result of urlparse()."""
    __slots__ = ()

    def geturl(self):
        """Recombine the components with urlunparse()."""
        return urlunparse(self)


# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    """bytes result of urldefrag()."""
    __slots__ = ()

    def geturl(self):
        """Re-attach the fragment (if any) to the URL."""
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url


class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    """bytes result of urlsplit()."""
    __slots__ = ()

    def geturl(self):
        """Recombine the components with urlunsplit()."""
        return urlunsplit(self)


class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    """bytes result of urlparse()."""
    __slots__ = ()

    def geturl(self):
        """Recombine the components with urlunparse()."""
        return urlunparse(self)


# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes counterpart and back."""
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded


_fix_result_transcoding()
del _fix_result_transcoding


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a named 6-tuple with fields corresponding to the
    above. It is either a ParseResult or ParseResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)


def _splitparams(url):
    """Split ';params' off the last path segment; return (path, params)."""
    if '/' in url:
        # Only a ';' after the last '/' introduces params.
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]


def _splitnetloc(url, start=0):
    """Return (netloc, rest), cutting url at the first of '/', '?' or '#'."""
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)


def _checknetloc(netloc):
    """Raise ValueError if NFKC normalization adds a delimiter to netloc."""
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    n = netloc.replace('@', '')   # ignore characters already included
    n = n.replace(':', '')        # but not the surrounding text
    n = n.replace('#', '')
    n = n.replace('?', '')
    netloc2 = unicodedata.normalize('NFKC', n)
    if n == netloc2:
        return
    for c in '/?#@:':
        if c in netloc2:
            raise ValueError("netloc '" + netloc + "' contains invalid " +
                             "characters under NFKC normalization")


def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Leading C0 control characters and spaces are stripped from url, and
    tab, carriage return and newline characters are removed from it
    anywhere they occur.

    Raises ValueError for an unbalanced '[' or ']' in the netloc, or for a
    netloc that gains a delimiter under NFKC normalization.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)

    # Only lstrip url: a leading blank would otherwise hide the scheme from
    # the parser, while a trailing one is left in place for callers that
    # rely on it being preserved.
    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)

    for b in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(b, "")
        scheme = scheme.replace(b, "")

    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # Every character before ':' is a scheme character.
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)


def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))


def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)


def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))


def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))


_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None


def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid %xx escape: keep the text as it was.
            append(b'%')
            append(item)
    return b''.join(res)


_asciire = re.compile('([\x00-\x7f]+)')


def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    Passing None for encoding or errors selects the default, for both str
    and bytes input.

    unquote('abc%20def') -> 'abc def'.
    """
    # Resolve the None defaults first so that bytes input honours them too.
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute access only: fails early for non string-like objects.
        string.split
        return string
    # Only the ASCII runs can hold escapes; non-ASCII text passes through.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)


def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None,
             separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a dictionary.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result


def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None,
              separator='&'):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError
            if there are more than n fields read by parse_qsl().

        separator: str. The symbol to use for separating the query arguments.
            Defaults to &.

        Returns a list of (name, value) pairs.

        Raises ValueError if separator is empty or is not str or bytes.
    """
    # Validate before coercing: coercion calls .decode() on non-str values,
    # which would fail with AttributeError instead of the ValueError below.
    if not separator or not isinstance(separator, (str, bytes)):
        raise ValueError("Separator must be of type string or bytes.")

    qs, _coerce_result = _coerce_args(qs)
    separator, _ = _coerce_args(separator)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    r = []
    for name_value in qs.split(separator):
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r


def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)


_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}


class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res


def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted. The
    quote function offers a cautious (not minimal) way to quote a
    string for most of these parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.

    The quote function %-escapes all characters that are neither in the
    unreserved chars ("always safe") nor the additional chars set via the
    safe arg.

    The default for the safe arg is '/'. The character is reserved, but in
    typical usage the quote function is being called on a path where the
    existing slash characters are to be preserved.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)


def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    # Keep spaces unescaped through quote(), then turn them into '+'.
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')


def quote_from_bytes(bs, safe='/'):
    r"""Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    # Fast path: nothing is left after stripping safe bytes, so no quoting.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])


def _quote_query_part(part, safe, encoding, errors, quote_via):
    """Quote one key or value: bytes as-is, anything else through str()."""
    if isinstance(part, bytes):
        return quote_via(part, safe)
    return quote_via(str(part), safe, encoding, errors)


def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError as err:
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(
                                err.__traceback__)

    l = []
    if not doseq:
        for k, v in query:
            k = _quote_query_part(k, safe, encoding, errors, quote_via)
            v = _quote_query_part(v, safe, encoding, errors, quote_via)
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = _quote_query_part(k, safe, encoding, errors, quote_via)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
                l.append(k + '=' + v)
            elif isinstance(v, str):
                v = quote_via(v, safe, encoding, errors)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = quote_via(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        elt = _quote_query_part(elt, safe, encoding, errors,
                                                quote_via)
                        l.append(k + '=' + elt)
    return '&'.join(l)


def to_bytes(url):
    """Deprecated public wrapper around _to_bytes(); warns on every call."""
    warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
                  DeprecationWarning, stacklevel=2)
    return _to_bytes(url)


def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
1002 # XXX get rid of to_bytes() 1003 if isinstance(url, str): 1004 try: 1005 url = url.encode("ASCII").decode() 1006 except UnicodeError: 1007 raise UnicodeError("URL " + repr(url) + 1008 " contains non-ASCII characters") 1009 return url 1010 1011 1012def unwrap(url): 1013 """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'. 1014 1015 The string is returned unchanged if it's not a wrapped URL. 1016 """ 1017 url = str(url).strip() 1018 if url[:1] == '<' and url[-1:] == '>': 1019 url = url[1:-1].strip() 1020 if url[:4] == 'URL:': 1021 url = url[4:].strip() 1022 return url 1023 1024 1025def splittype(url): 1026 warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, " 1027 "use urllib.parse.urlparse() instead", 1028 DeprecationWarning, stacklevel=2) 1029 return _splittype(url) 1030 1031 1032_typeprog = None 1033def _splittype(url): 1034 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 1035 global _typeprog 1036 if _typeprog is None: 1037 _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) 1038 1039 match = _typeprog.match(url) 1040 if match: 1041 scheme, data = match.groups() 1042 return scheme.lower(), data 1043 return None, url 1044 1045 1046def splithost(url): 1047 warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, " 1048 "use urllib.parse.urlparse() instead", 1049 DeprecationWarning, stacklevel=2) 1050 return _splithost(url) 1051 1052 1053_hostprog = None 1054def _splithost(url): 1055 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 1056 global _hostprog 1057 if _hostprog is None: 1058 _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) 1059 1060 match = _hostprog.match(url) 1061 if match: 1062 host_port, path = match.groups() 1063 if path and path[0] != '/': 1064 path = '/' + path 1065 return host_port, path 1066 return None, url 1067 1068 1069def splituser(host): 1070 warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, " 1071 "use urllib.parse.urlparse() instead", 
1072 DeprecationWarning, stacklevel=2) 1073 return _splituser(host) 1074 1075 1076def _splituser(host): 1077 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 1078 user, delim, host = host.rpartition('@') 1079 return (user if delim else None), host 1080 1081 1082def splitpasswd(user): 1083 warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, " 1084 "use urllib.parse.urlparse() instead", 1085 DeprecationWarning, stacklevel=2) 1086 return _splitpasswd(user) 1087 1088 1089def _splitpasswd(user): 1090 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 1091 user, delim, passwd = user.partition(':') 1092 return user, (passwd if delim else None) 1093 1094 1095def splitport(host): 1096 warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, " 1097 "use urllib.parse.urlparse() instead", 1098 DeprecationWarning, stacklevel=2) 1099 return _splitport(host) 1100 1101 1102# splittag('/path#tag') --> '/path', 'tag' 1103_portprog = None 1104def _splitport(host): 1105 """splitport('host:port') --> 'host', 'port'.""" 1106 global _portprog 1107 if _portprog is None: 1108 _portprog = re.compile('(.*):([0-9]*)', re.DOTALL) 1109 1110 match = _portprog.fullmatch(host) 1111 if match: 1112 host, port = match.groups() 1113 if port: 1114 return host, port 1115 return host, None 1116 1117 1118def splitnport(host, defport=-1): 1119 warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, " 1120 "use urllib.parse.urlparse() instead", 1121 DeprecationWarning, stacklevel=2) 1122 return _splitnport(host, defport) 1123 1124 1125def _splitnport(host, defport=-1): 1126 """Split host and port, returning numeric port. 1127 Return given default port if no ':' found; defaults to -1. 1128 Return numerical port if a valid number are found after ':'. 
1129 Return None if ':' but not a valid number.""" 1130 host, delim, port = host.rpartition(':') 1131 if not delim: 1132 host = port 1133 elif port: 1134 try: 1135 nport = int(port) 1136 except ValueError: 1137 nport = None 1138 return host, nport 1139 return host, defport 1140 1141 1142def splitquery(url): 1143 warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, " 1144 "use urllib.parse.urlparse() instead", 1145 DeprecationWarning, stacklevel=2) 1146 return _splitquery(url) 1147 1148 1149def _splitquery(url): 1150 """splitquery('/path?query') --> '/path', 'query'.""" 1151 path, delim, query = url.rpartition('?') 1152 if delim: 1153 return path, query 1154 return url, None 1155 1156 1157def splittag(url): 1158 warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, " 1159 "use urllib.parse.urlparse() instead", 1160 DeprecationWarning, stacklevel=2) 1161 return _splittag(url) 1162 1163 1164def _splittag(url): 1165 """splittag('/path#tag') --> '/path', 'tag'.""" 1166 path, delim, tag = url.rpartition('#') 1167 if delim: 1168 return path, tag 1169 return url, None 1170 1171 1172def splitattr(url): 1173 warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, " 1174 "use urllib.parse.urlparse() instead", 1175 DeprecationWarning, stacklevel=2) 1176 return _splitattr(url) 1177 1178 1179def _splitattr(url): 1180 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1181 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1182 words = url.split(';') 1183 return words[0], words[1:] 1184 1185 1186def splitvalue(attr): 1187 warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, " 1188 "use urllib.parse.parse_qsl() instead", 1189 DeprecationWarning, stacklevel=2) 1190 return _splitvalue(attr) 1191 1192 1193def _splitvalue(attr): 1194 """splitvalue('attr=value') --> 'attr', 'value'.""" 1195 attr, delim, value = attr.partition('=') 1196 return attr, (value if delim else None) 1197