1"""Parse (absolute and relative) URLs. 2 3urlparse module is based upon the following RFC specifications. 4 5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 6and L. Masinter, January 2005. 7 8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter 9and L.Masinter, December 1999. 10 11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 12Berners-Lee, R. Fielding, and L. Masinter, August 1998. 13 14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. 15 16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 171995. 18 19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 20McCahill, December 1994 21 22RFC 3986 is considered the current standard and any future changes to 23urlparse module should conform with it. The urlparse module is 24currently not entirely compliant with this RFC due to defacto 25scenarios for parsing, and for backward compatibility purposes, some 26parsing quirks from older RFCs are retained. The testcases in 27test_urlparse.py provides a good indicator of parsing behavior. 28""" 29 30import re 31import sys 32import collections 33import warnings 34 35__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 36 "urlsplit", "urlunsplit", "urlencode", "parse_qs", 37 "parse_qsl", "quote", "quote_plus", "quote_from_bytes", 38 "unquote", "unquote_plus", "unquote_to_bytes", 39 "DefragResult", "ParseResult", "SplitResult", 40 "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"] 41 42# A classification of schemes. 43# The empty string classifies URLs with no scheme specified, 44# being the default value returned by “urlsplit” and “urlparse”. 

uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}  # memoizes urlsplit() results; bounded by MAX_CACHE_SIZE

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    # Identity coercion, used when the caller passed str arguments.
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    # str -> bytes coercion applied to results when the caller passed bytes.
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    # Decode each bytes argument to str; falsy values become ''.
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #  - noop for str inputs
    #  - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        # _encoded_counterpart is wired up by _fix_result_transcoding().
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        # _decoded_counterpart is wired up by _fix_result_transcoding().
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        # First element of (username, password); None when netloc has no '@'.
        return self._userinfo[0]

    @property
    def password(self):
        # Second element of (username, password); None when userinfo has no ':'.
        return self._userinfo[1]

    @property
    def hostname(self):
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        port = self._hostinfo[1]
        if port is not None:
            try:
                # Explicit base 10: rejects hex/other bases but keeps
                # leading-zero decimal strings working.
                port = int(port, 10)
            except ValueError:
                message = f'Port could not be cast to integer value as {port!r}'
                raise ValueError(message) from None
            if not ( 0 <= port <= 65535):
                raise ValueError("Port out of range 0-65535")
        return port


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()

    @property
    def _userinfo(self):
        # Split netloc into (username, password).  rpartition('@') uses the
        # *last* '@', so the userinfo part may itself contain '@'.
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        # Split netloc into (hostname, port), handling '[...]'-bracketed IPv6
        # literals whose colons must not be mistaken for the port separator.
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            hostname, _, port = bracketed.partition(']')
            _, _, port = port.partition(':')
        else:
            hostname, _, port = hostinfo.partition(':')
        if not port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    __slots__ = ()

    @property
    def _userinfo(self):
        # bytes twin of _NetlocResultMixinStr._userinfo.
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        # bytes twin of _NetlocResultMixinStr._hostinfo.
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            hostname, _, port = bracketed.partition(b']')
            _, _, port = port.partition(b':')
        else:
            hostname, _, port = hostinfo.partition(b':')
        if not port:
            port = None
        return hostname, port


from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
278""" 279 280_SplitResultBase.query.__doc__ = """ 281The query component, that contains non-hierarchical data, that along with data 282in path component, identifies a resource in the scope of URI's scheme and 283network location. 284""" 285 286_SplitResultBase.fragment.__doc__ = """ 287Fragment identifier, that allows indirect identification of a secondary resource 288by reference to a primary resource and additional identifying information. 289""" 290 291_ParseResultBase.__doc__ = """ 292ParseResult(scheme, netloc, path, params, query, fragment) 293 294A 6-tuple that contains components of a parsed URL. 295""" 296 297_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__ 298_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__ 299_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__ 300_ParseResultBase.params.__doc__ = """ 301Parameters for last path element used to dereference the URI in order to provide 302access to perform some operation on the resource. 
303""" 304 305_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__ 306_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__ 307 308 309# For backwards compatibility, alias _NetlocResultMixinStr 310# ResultBase is no longer part of the documented API, but it is 311# retained since deprecating it isn't worth the hassle 312ResultBase = _NetlocResultMixinStr 313 314# Structured result objects for string data 315class DefragResult(_DefragResultBase, _ResultMixinStr): 316 __slots__ = () 317 def geturl(self): 318 if self.fragment: 319 return self.url + '#' + self.fragment 320 else: 321 return self.url 322 323class SplitResult(_SplitResultBase, _NetlocResultMixinStr): 324 __slots__ = () 325 def geturl(self): 326 return urlunsplit(self) 327 328class ParseResult(_ParseResultBase, _NetlocResultMixinStr): 329 __slots__ = () 330 def geturl(self): 331 return urlunparse(self) 332 333# Structured result objects for bytes data 334class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): 335 __slots__ = () 336 def geturl(self): 337 if self.fragment: 338 return self.url + b'#' + self.fragment 339 else: 340 return self.url 341 342class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): 343 __slots__ = () 344 def geturl(self): 345 return urlunsplit(self) 346 347class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): 348 __slots__ = () 349 def geturl(self): 350 return urlunparse(self) 351 352# Set up the encode/decode result pairs 353def _fix_result_transcoding(): 354 _result_pairs = ( 355 (DefragResult, DefragResultBytes), 356 (SplitResult, SplitResultBytes), 357 (ParseResult, ParseResultBytes), 358 ) 359 for _decoded, _encoded in _result_pairs: 360 _decoded._encoded_counterpart = _encoded 361 _encoded._decoded_counterpart = _decoded 362 363_fix_result_transcoding() 364del _fix_result_transcoding 365 366def urlparse(url, scheme='', allow_fragments=True): 367 """Parse a URL into 6 components: 368 
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    # Params are only split off the path for schemes known to use them.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

def _splitparams(url):
    # Split ';params' off the *last* path segment only; a ';' in an earlier
    # segment is left untouched.
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    # Return (netloc, rest): netloc runs up to the first '/', '?' or '#'.
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:            # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def _checknetloc(netloc):
    # Reject netlocs containing characters that NFKC-normalize into URL
    # delimiters, which could otherwise smuggle extra components past the
    # parser (bidirectional spoofing protection).
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    n = netloc.replace('@', '')   # ignore characters already included
    n = n.replace(':', '')        # but not the surrounding text
    n = n.replace('#', '')
    n = n.replace('?', '')
    netloc2 = unicodedata.normalize('NFKC', n)
    if n == netloc2:
        return
    for c in '/?#@:':
        if c in netloc2:
            raise ValueError("netloc '" + netloc + "' contains invalid " +
                             "characters under NFKC normalization")

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    # Cache key includes the argument *types* so str and bytes inputs that
    # decode to the same text do not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        # Only treat the prefix as a scheme when every character before the
        # ':' is a valid scheme character (for-else: no break means valid).
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                  _coerce_args(*components))
    if params:
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
                                          _coerce_args(*components))
    # Re-insert the '//' netloc marker when there is a netloc, or when the
    # scheme conventionally uses one and the path doesn't already start '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    # A different scheme (or one that cannot be relative) means url is
    # already absolute - return it unchanged.
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Empty path: inherit the base path (and query, unless overridden).
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    # Resolve '.' and '..' segments (cf. RFC 3986 section 5.2.4,
    # "remove_dot_segments").
    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an
                # IndexError when popped from resolved_path if resolving
                # for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))


def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))

_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None  # lazily-built b'%XX' -> byte table; see unquote_to_bytes()

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    bits = string.split(b'%')
    if len(bits) == 1:
        # No '%' present, so there is nothing to unquote.
        return string
    res = [bits[0]]
    append = res.append
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid two-digit hex escape; keep the '%' literally.
            append(b'%')
            append(item)
    return b''.join(res)

_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        # Bare attribute access: raises AttributeError (rather than silently
        # succeeding) for inputs that lack str's interface, e.g. bytes.
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # Split into alternating ASCII/non-ASCII runs; only the ASCII runs (odd
    # indices) can contain %xx escapes that need decoding.
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)


def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    max_num_fields: int. If set, then throws a ValueError if there
        are more than n fields read by parse_qsl().

    Returns a dictionary.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields)
    # Group repeated field names under a single key with a list of values.
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result


def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    max_num_fields: int. If set, then throws a ValueError
        if there are more than n fields read by parse_qsl().

    Returns a list, as G-d intended.
    """
    qs, _coerce_result = _coerce_args(qs)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        num_fields = 1 + qs.count('&') + qs.count(';')
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    # Both '&' and ';' are accepted as pair separators here.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

# RFC 3986 "unreserved" characters: never percent-encoded by quote().
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}  # per-'safe'-set cache of Quoter.__getitem__ callables

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.
    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted. The
    quote function offers a cautious (not minimal) way to quote a
    string for most of these parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.

    The quote function %-escapes all characters that are neither in the
    unreserved chars ("always safe") nor the additional chars set via the
    safe arg.

    The default for the safe arg is '/'. The character is reserved, but in
    typical usage the quote function is being called on a path where the
    existing slash characters are to be preserved.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        # Encode to bytes so quote_from_bytes can do the actual quoting.
        string = string.encode(encoding, errors)
    else:
        # encoding/errors are meaningless for bytes input; reject them.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
        (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    # Add a space to 'safe' (matching string/safe types) so quote() leaves
    # spaces alone, then convert them to '+' afterwards.
    if isinstance(safe, str):
        space = ' '
    else:
        space = b' '
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes([c for c in safe if c < 128])
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        # Every byte is safe: the input can be returned undecorated.
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        # Cache one Quoter (its bound __getitem__) per distinct safe set.
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return ''.join([quoter(char) for char in bs])

def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The components of a query arg may each be either a string or a bytes type.

    The safe, encoding, and errors parameters are passed down to the function
    specified by quote_via (encoding and errors only if a component is a str).
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    l = []
    if not doseq:
        # Simple mode: every value is stringified and quoted as one unit.
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
            else:
                v = quote_via(str(v), safe, encoding, errors)
            l.append(k + '=' + v)
    else:
        # doseq mode: str/bytes values are single parameters, any other
        # sequence yields one 'k=elt' parameter per element.
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_via(k, safe)
            else:
                k = quote_via(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_via(v, safe)
                l.append(k + '=' + v)
            elif isinstance(v, str):
                v = quote_via(v, safe, encoding, errors)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_via(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        if isinstance(elt, bytes):
                            elt = quote_via(elt, safe)
                        else:
                            elt = quote_via(str(elt), safe, encoding, errors)
                        l.append(k + '=' + elt)
    return '&'.join(l)


def to_bytes(url):
    # Deprecated public wrapper; kept only so old callers get a warning
    # instead of an AttributeError.
    warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
                  DeprecationWarning, stacklevel=2)
    return _to_bytes(url)


def _to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
951 # XXX get rid of to_bytes() 952 if isinstance(url, str): 953 try: 954 url = url.encode("ASCII").decode() 955 except UnicodeError: 956 raise UnicodeError("URL " + repr(url) + 957 " contains non-ASCII characters") 958 return url 959 960 961def unwrap(url): 962 """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'. 963 964 The string is returned unchanged if it's not a wrapped URL. 965 """ 966 url = str(url).strip() 967 if url[:1] == '<' and url[-1:] == '>': 968 url = url[1:-1].strip() 969 if url[:4] == 'URL:': 970 url = url[4:].strip() 971 return url 972 973 974def splittype(url): 975 warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, " 976 "use urllib.parse.urlparse() instead", 977 DeprecationWarning, stacklevel=2) 978 return _splittype(url) 979 980 981_typeprog = None 982def _splittype(url): 983 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 984 global _typeprog 985 if _typeprog is None: 986 _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) 987 988 match = _typeprog.match(url) 989 if match: 990 scheme, data = match.groups() 991 return scheme.lower(), data 992 return None, url 993 994 995def splithost(url): 996 warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, " 997 "use urllib.parse.urlparse() instead", 998 DeprecationWarning, stacklevel=2) 999 return _splithost(url) 1000 1001 1002_hostprog = None 1003def _splithost(url): 1004 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 1005 global _hostprog 1006 if _hostprog is None: 1007 _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) 1008 1009 match = _hostprog.match(url) 1010 if match: 1011 host_port, path = match.groups() 1012 if path and path[0] != '/': 1013 path = '/' + path 1014 return host_port, path 1015 return None, url 1016 1017 1018def splituser(host): 1019 warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, " 1020 "use urllib.parse.urlparse() instead", 1021 DeprecationWarning, stacklevel=2) 1022 return 
_splituser(host) 1023 1024 1025def _splituser(host): 1026 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 1027 user, delim, host = host.rpartition('@') 1028 return (user if delim else None), host 1029 1030 1031def splitpasswd(user): 1032 warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, " 1033 "use urllib.parse.urlparse() instead", 1034 DeprecationWarning, stacklevel=2) 1035 return _splitpasswd(user) 1036 1037 1038def _splitpasswd(user): 1039 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 1040 user, delim, passwd = user.partition(':') 1041 return user, (passwd if delim else None) 1042 1043 1044def splitport(host): 1045 warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, " 1046 "use urllib.parse.urlparse() instead", 1047 DeprecationWarning, stacklevel=2) 1048 return _splitport(host) 1049 1050 1051# splittag('/path#tag') --> '/path', 'tag' 1052_portprog = None 1053def _splitport(host): 1054 """splitport('host:port') --> 'host', 'port'.""" 1055 global _portprog 1056 if _portprog is None: 1057 _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL) 1058 1059 match = _portprog.match(host) 1060 if match: 1061 host, port = match.groups() 1062 if port: 1063 return host, port 1064 return host, None 1065 1066 1067def splitnport(host, defport=-1): 1068 warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, " 1069 "use urllib.parse.urlparse() instead", 1070 DeprecationWarning, stacklevel=2) 1071 return _splitnport(host, defport) 1072 1073 1074def _splitnport(host, defport=-1): 1075 """Split host and port, returning numeric port. 1076 Return given default port if no ':' found; defaults to -1. 1077 Return numerical port if a valid number are found after ':'. 
1078 Return None if ':' but not a valid number.""" 1079 host, delim, port = host.rpartition(':') 1080 if not delim: 1081 host = port 1082 elif port: 1083 try: 1084 nport = int(port) 1085 except ValueError: 1086 nport = None 1087 return host, nport 1088 return host, defport 1089 1090 1091def splitquery(url): 1092 warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, " 1093 "use urllib.parse.urlparse() instead", 1094 DeprecationWarning, stacklevel=2) 1095 return _splitquery(url) 1096 1097 1098def _splitquery(url): 1099 """splitquery('/path?query') --> '/path', 'query'.""" 1100 path, delim, query = url.rpartition('?') 1101 if delim: 1102 return path, query 1103 return url, None 1104 1105 1106def splittag(url): 1107 warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, " 1108 "use urllib.parse.urlparse() instead", 1109 DeprecationWarning, stacklevel=2) 1110 return _splittag(url) 1111 1112 1113def _splittag(url): 1114 """splittag('/path#tag') --> '/path', 'tag'.""" 1115 path, delim, tag = url.rpartition('#') 1116 if delim: 1117 return path, tag 1118 return url, None 1119 1120 1121def splitattr(url): 1122 warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, " 1123 "use urllib.parse.urlparse() instead", 1124 DeprecationWarning, stacklevel=2) 1125 return _splitattr(url) 1126 1127 1128def _splitattr(url): 1129 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1130 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1131 words = url.split(';') 1132 return words[0], words[1:] 1133 1134 1135def splitvalue(attr): 1136 warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, " 1137 "use urllib.parse.parse_qsl() instead", 1138 DeprecationWarning, stacklevel=2) 1139 return _splitvalue(attr) 1140 1141 1142def _splitvalue(attr): 1143 """splitvalue('attr=value') --> 'attr', 'value'.""" 1144 attr, delim, value = attr.partition('=') 1145 return attr, (value if delim else None) 1146