1"""Parse (absolute and relative) URLs. 2 3urlparse module is based upon the following RFC specifications. 4 5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 6and L. Masinter, January 2005. 7 8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter 9and L.Masinter, December 1999. 10 11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 12Berners-Lee, R. Fielding, and L. Masinter, August 1998. 13 14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. 15 16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 171995. 18 19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 20McCahill, December 1994 21 22RFC 3986 is considered the current standard and any future changes to 23urlparse module should conform with it. The urlparse module is 24currently not entirely compliant with this RFC due to defacto 25scenarios for parsing, and for backward compatibility purposes, some 26parsing quirks from older RFCs are retained. The testcases in 27test_urlparse.py provides a good indicator of parsing behavior. 28""" 29 30import re 31import sys 32import types 33import collections 34import warnings 35 36__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 37 "urlsplit", "urlunsplit", "urlencode", "parse_qs", 38 "parse_qsl", "quote", "quote_plus", "quote_from_bytes", 39 "unquote", "unquote_plus", "unquote_to_bytes", 40 "DefragResult", "ParseResult", "SplitResult", 41 "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"] 42 43# A classification of schemes. 44# The empty string classifies URLs with no scheme specified, 45# being the default value returned by “urlsplit” and “urlparse”. 46 47uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap', 48 'wais', 'file', 'https', 'shttp', 'mms', 49 'prospero', 'rtsp', 'rtspu', 'sftp', 50 'svn', 'svn+ssh', 'ws', 'wss'] 51 52uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet', 53 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 54 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', 55 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh', 56 'ws', 'wss'] 57 58uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap', 59 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', 60 'mms', 'sftp', 'tel'] 61 62# These are not actually used anymore, but should stay for backwards 63# compatibility. (They are undocumented, but have a public-looking name.) 64 65non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 66 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] 67 68uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms', 69 'gopher', 'rtsp', 'rtspu', 'sip', 'sips'] 70 71uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news', 72 'nntp', 'wais', 'https', 'shttp', 'snews', 73 'file', 'prospero'] 74 75# Characters valid in scheme names 76scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 77 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 78 '0123456789' 79 '+-.') 80 81# XXX: Consider replacing with functools.lru_cache 82MAX_CACHE_SIZE = 20 83_parse_cache = {} 84 85def clear_cache(): 86 """Clear the parse cache and the quoters cache.""" 87 _parse_cache.clear() 88 _safe_quoters.clear() 89 90 91# Helpers for bytes handling 92# For 3.2, we deliberately require applications that 93# handle improperly quoted URLs to do their own 94# decoding and encoding. 

# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #   - noop for str inputs
    #   - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        return self._userinfo[0]

    @property
    def password(self):
        return self._userinfo[1]

    @property
    def hostname(self):
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        port = self._hostinfo[1]
        if port is not None:
            try:
                port = int(port, 10)
            except ValueError:
                message = f'Port could not be cast to integer value as {port!r}'
                raise ValueError(message) from None
            if not (0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)
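
# Illustrative use of the properties above (the example URL is arbitrary):
#
#     >>> r = urlsplit('http://User:Pass@www.Example.COM:8042/p?q#f')
#     >>> r.username, r.password
#     ('User', 'Pass')
#     >>> r.hostname              # lowercased; brackets/zone handled separately
#     'www.example.com'
#     >>> r.port                  # converted to int, or None if absent
#     8042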


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port


from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
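
# The concrete result classes below are namedtuple subclasses, so normal tuple
# behaviour is available in addition to the attribute access documented above
# (illustrative only; the URL is arbitrary):
#
#     >>> parts = urlsplit('http://example.com/pth?q=1#frag')
#     >>> tuple(parts)
#     ('http', 'example.com', '/pth', 'q=1', 'frag')
#     >>> parts._replace(scheme='https').geturl()
#     'https://example.com/pth?q=1#frag'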


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

# Set up the encode/decode result pairs
def _fix_result_transcoding():
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

_fix_result_transcoding()
del _fix_result_transcoding

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a named 6-tuple with fields corresponding to the
    above. It is either a ParseResult or ParseResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]
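
# Illustrative examples for urlparse() (URLs are arbitrary):
#
#     >>> urlparse('http://example.com/p;type=a?q=1#top')
#     ParseResult(scheme='http', netloc='example.com', path='/p', params='type=a', query='q=1', fragment='top')
#     >>> urlparse(b'http://example.com/p')     # bytes in, bytes out
#     ParseResultBytes(scheme=b'http', netloc=b'example.com', path=b'/p', params=b'', query=b'', fragment=b'')
#     >>> urlparse('http://example.com/p').encode() == urlparse(b'http://example.com/p')
#     True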

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def _checknetloc(netloc):
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    n = netloc.replace('@', '')   # ignore characters already included
    n = n.replace(':', '')        # but not the surrounding text
    n = n.replace('#', '')
    n = n.replace('?', '')
    netloc2 = unicodedata.normalize('NFKC', n)
    if n == netloc2:
        return
    for c in '/?#@:':
        if c in netloc2:
            raise ValueError("netloc '" + netloc + "' contains invalid " +
                             "characters under NFKC normalization")

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
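
# Illustrative edge cases for urlsplit() (hosts are arbitrary):
#
#     >>> r = urlsplit('http://[2001:db8::1]:8080/p')
#     >>> r.hostname, r.port
#     ('2001:db8::1', 8080)
#     >>> urlsplit('http://example.com/p#frag', allow_fragments=False).path
#     '/p#frag'
#     >>> urlsplit('http://[::1/p')          # unbalanced bracket
#     Traceback (most recent call last):
#         ...
#     ValueError: Invalid IPv6 URL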

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    if params:
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
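
# Illustrative behaviour of urljoin(), following RFC 3986 reference
# resolution (base and reference URLs are arbitrary):
#
#     >>> urljoin('http://a/b/c/d;p?q', 'g')
#     'http://a/b/c/g'
#     >>> urljoin('http://a/b/c/d;p?q', '../g')
#     'http://a/b/g'
#     >>> urljoin('http://a/b/c/d;p?q', '//other/x')
#     'http://other/x'
#     >>> urljoin('http://a/b/c/d;p?q', 'ftp://other/x')
#     'ftp://other/x'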


def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))

_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            append(b'%')
            append(item)
    return b''.join(res)

_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Is it a string-like object?
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)
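
# Illustrative examples for the unquoting helpers (inputs are arbitrary);
# with the default errors='replace', undecodable sequences become U+FFFD:
#
#     >>> unquote_to_bytes('abc%20def%C3%A9')
#     b'abc def\xc3\xa9'
#     >>> unquote('abc%20def%C3%A9')
#     'abc defé'
#     >>> unquote('%E9', encoding='latin-1')
#     'é'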


def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    max_num_fields: int. If set, then throws a ValueError if there
        are more than n fields read by parse_qsl().

    Returns a dictionary.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields)
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result


def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    max_num_fields: int. If set, then throws a ValueError
        if there are more than n fields read by parse_qsl().

    Returns a list, as G-d intended.
    """
    qs, _coerce_result = _coerce_args(qs)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count('&') + qs.count(';')
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)
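
# Illustrative examples for query-string parsing (field names are arbitrary):
#
#     >>> parse_qsl('a=1&a=2&b=&c')
#     [('a', '1'), ('a', '2')]
#     >>> parse_qsl('a=1&a=2&b=&c', keep_blank_values=True)
#     [('a', '1'), ('a', '2'), ('b', ''), ('c', '')]
#     >>> parse_qs('a=1&a=2&name=John+Doe')
#     {'a': ['1', '2'], 'name': ['John Doe']}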

_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted. The
    quote function offers a cautious (not minimal) way to quote a
    string for most of these parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.

    The quote function %-escapes all characters that are neither in the
    unreserved chars ("always safe") nor the additional chars set via the
    safe arg.

    The default for the safe arg is '/'. The character is reserved, but in
    typical usage the quote function is being called on a path where the
    existing slash characters are to be preserved.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values. Plus signs in the original string are escaped unless
    they are included in safe. It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')
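
# Illustrative examples for quote() and quote_plus() (strings are arbitrary):
#
#     >>> quote('/El Niño/')                  # '/' is safe by default
#     '/El%20Ni%C3%B1o/'
#     >>> quote('/El Niño/', safe='')
#     '%2FEl%20Ni%C3%B1o%2F'
#     >>> quote_plus('a b&c')                 # spaces become '+', '&' is escaped
#     'a+b%26c'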

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
            else:
                v = quote_via(str(v), safe, encoding, errors)
            l.append(k + '=' + v)
    else:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
                l.append(k + '=' + v)
            elif isinstance(v, str):
                v = quote_via(v, safe, encoding, errors)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_via(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        if isinstance(elt, bytes):
                            elt = quote_via(elt, safe)
                        else:
                            elt = quote_via(str(elt), safe, encoding, errors)
                        l.append(k + '=' + elt)
    return '&'.join(l)


def to_bytes(url):
    warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
                  DeprecationWarning, stacklevel=2)
    return _to_bytes(url)


def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url


def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The string is returned unchanged if it's not a wrapped URL.
    """
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url


def splittype(url):
    warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splittype(url)


_typeprog = None
def _splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)

    match = _typeprog.match(url)
    if match:
        scheme, data = match.groups()
        return scheme.lower(), data
    return None, url


def splithost(url):
    warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splithost(url)


_hostprog = None
def _splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)

    match = _hostprog.match(url)
    if match:
        host_port, path = match.groups()
        if path and path[0] != '/':
            path = '/' + path
        return host_port, path
    return None, url


def splituser(host):
    warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splituser(host)


def _splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host


def splitpasswd(user):
    warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)


def _splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    user, delim, passwd = user.partition(':')
    return user, (passwd if delim else None)


def splitport(host):
    warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitport(host)


_portprog = None
def _splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)

    match = _portprog.fullmatch(host)
    if match:
        host, port = match.groups()
        if port:
            return host, port
    return host, None


def splitnport(host, defport=-1):
    warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)


def _splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    host, delim, port = host.rpartition(':')
    if not delim:
        host = port
    elif port:
        try:
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport


def splitquery(url):
    warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitquery(url)


def _splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    path, delim, query = url.rpartition('?')
    if delim:
        return path, query
    return url, None


def splittag(url):
    warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splittag(url)


def _splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    path, delim, tag = url.rpartition('#')
    if delim:
        return path, tag
    return url, None


def splitattr(url):
    warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, "
                  "use urllib.parse.urlparse() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitattr(url)


def _splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]


def splitvalue(attr):
    warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, "
                  "use urllib.parse.parse_qsl() instead",
                  DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)


def _splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    attr, delim, value = attr.partition('=')
    return attr, (value if delim else None)
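
# Illustrative end-to-end example for urlencode() (keys and values are
# arbitrary); parse_qsl() performs the reverse transformation:
#
#     >>> urlencode({'name': 'John Doe', 'tags': ['a', 'b']}, doseq=True)
#     'name=John+Doe&tags=a&tags=b'
#     >>> urlencode({'path': '/tmp/x y'}, safe='/', quote_via=quote)
#     'path=/tmp/x%20y'
#     >>> parse_qsl('name=John+Doe&tags=a&tags=b')
#     [('name', 'John Doe'), ('tags', 'a'), ('tags', 'b')]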