1"""Parse (absolute and relative) URLs. 2 3urlparse module is based upon the following RFC specifications. 4 5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 6and L. Masinter, January 2005. 7 8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter 9and L.Masinter, December 1999. 10 11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 12Berners-Lee, R. Fielding, and L. Masinter, August 1998. 13 14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. 15 16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 171995. 18 19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 20McCahill, December 1994 21 22RFC 3986 is considered the current standard and any future changes to 23urlparse module should conform with it. The urlparse module is 24currently not entirely compliant with this RFC due to defacto 25scenarios for parsing, and for backward compatibility purposes, some 26parsing quirks from older RFCs are retained. The testcases in 27test_urlparse.py provides a good indicator of parsing behavior. 28""" 29 30import re 31import sys 32import collections 33 34__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 35 "urlsplit", "urlunsplit", "urlencode", "parse_qs", 36 "parse_qsl", "quote", "quote_plus", "quote_from_bytes", 37 "unquote", "unquote_plus", "unquote_to_bytes", 38 "DefragResult", "ParseResult", "SplitResult", 39 "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"] 40 41# A classification of schemes. 42# The empty string classifies URLs with no scheme specified, 43# being the default value returned by “urlsplit” and “urlparse”. 44 45uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap', 46 'wais', 'file', 'https', 'shttp', 'mms', 47 'prospero', 'rtsp', 'rtspu', 'sftp', 48 'svn', 'svn+ssh', 'ws', 'wss'] 49 50uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet', 51 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 52 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', 53 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh', 54 'ws', 'wss'] 55 56uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap', 57 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', 58 'mms', 'sftp', 'tel'] 59 60# These are not actually used anymore, but should stay for backwards 61# compatibility. (They are undocumented, but have a public-looking name.) 62 63non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 64 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] 65 66uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms', 67 'gopher', 'rtsp', 'rtspu', 'sip', 'sips'] 68 69uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news', 70 'nntp', 'wais', 'https', 'shttp', 'snews', 71 'file', 'prospero'] 72 73# Characters valid in scheme names 74scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 75 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 76 '0123456789' 77 '+-.') 78 79# XXX: Consider replacing with functools.lru_cache 80MAX_CACHE_SIZE = 20 81_parse_cache = {} 82 83def clear_cache(): 84 """Clear the parse cache and the quoters cache.""" 85 _parse_cache.clear() 86 _safe_quoters.clear() 87 88 89# Helpers for bytes handling 90# For 3.2, we deliberately require applications that 91# handle improperly quoted URLs to do their own 92# decoding and encoding. If valid use cases are 93# presented, we may relax this by using latin-1 94# decoding internally for 3.3 95_implicit_encoding = 'ascii' 96_implicit_errors = 'strict' 97 98def _noop(obj): 99 return obj 100 101def _encode_result(obj, encoding=_implicit_encoding, 102 errors=_implicit_errors): 103 return obj.encode(encoding, errors) 104 105def _decode_args(args, encoding=_implicit_encoding, 106 errors=_implicit_errors): 107 return tuple(x.decode(encoding, errors) if x else '' for x in args) 108 109def _coerce_args(*args): 110 # Invokes decode if necessary to create str args 111 # and returns the coerced inputs along with 112 # an appropriate result coercion function 113 # - noop for str inputs 114 # - encoding function otherwise 115 str_input = isinstance(args[0], str) 116 for arg in args[1:]: 117 # We special-case the empty string to support the 118 # "scheme=''" default argument to some functions 119 if arg and isinstance(arg, str) != str_input: 120 raise TypeError("Cannot mix str and non-str arguments") 121 if str_input: 122 return args + (_noop,) 123 return _decode_args(args) + (_encode_result,) 124 125# Result objects are more helpful than simple tuples 126class _ResultMixinStr(object): 127 """Standard approach to encoding parsed results from str to bytes""" 128 __slots__ = () 129 130 def encode(self, encoding='ascii', errors='strict'): 131 return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self)) 132 133 134class _ResultMixinBytes(object): 135 """Standard approach to decoding parsed results from bytes to str""" 136 __slots__ = () 137 138 def decode(self, encoding='ascii', errors='strict'): 139 return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self)) 140 141 142class _NetlocResultMixinBase(object): 143 """Shared methods for the parsed result objects containing a netloc element""" 144 __slots__ = () 145 146 @property 147 def username(self): 148 return self._userinfo[0] 149 150 @property 151 def password(self): 152 return self._userinfo[1] 153 154 @property 155 def hostname(self): 156 hostname = self._hostinfo[0] 157 if not hostname: 158 return None 159 # Scoped IPv6 address may have zone info, which must not be lowercased 160 # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys 161 separator = '%' if isinstance(hostname, str) else b'%' 162 hostname, percent, zone = hostname.partition(separator) 163 return hostname.lower() + percent + zone 164 165 @property 166 def port(self): 167 port = self._hostinfo[1] 168 if port is not None: 169 port = int(port, 10) 170 if not ( 0 <= port <= 65535): 171 raise ValueError("Port out of range 0-65535") 172 return port 173 174 175class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr): 176 __slots__ = () 177 178 @property 179 def _userinfo(self): 180 netloc = self.netloc 181 userinfo, have_info, hostinfo = netloc.rpartition('@') 182 if have_info: 183 username, have_password, password = userinfo.partition(':') 184 if not have_password: 185 password = None 186 else: 187 username = password = None 188 return username, password 189 190 @property 191 def _hostinfo(self): 192 netloc = self.netloc 193 _, _, hostinfo = netloc.rpartition('@') 194 _, have_open_br, bracketed = hostinfo.partition('[') 195 if have_open_br: 196 hostname, _, port = bracketed.partition(']') 197 _, _, port = port.partition(':') 198 else: 199 hostname, _, port = hostinfo.partition(':') 200 if not port: 201 port = None 202 return hostname, port 203 204 205class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): 206 __slots__ = () 207 208 @property 209 def _userinfo(self): 210 netloc = self.netloc 211 userinfo, have_info, hostinfo = netloc.rpartition(b'@') 212 if have_info: 213 username, have_password, password = userinfo.partition(b':') 214 if not have_password: 215 password = None 216 else: 217 username = password = None 218 return username, password 219 220 @property 221 def _hostinfo(self): 222 netloc = self.netloc 223 _, _, hostinfo = netloc.rpartition(b'@') 224 _, have_open_br, bracketed = hostinfo.partition(b'[') 225 if have_open_br: 226 hostname, _, port = bracketed.partition(b']') 227 _, _, port = port.partition(b':') 228 else: 229 hostname, _, port = hostinfo.partition(b':') 230 if not port: 231 port = None 232 return hostname, port 233 234 235from collections import namedtuple 236 237_DefragResultBase = namedtuple('DefragResult', 'url fragment') 238_SplitResultBase = namedtuple( 239 'SplitResult', 'scheme netloc path query fragment') 240_ParseResultBase = namedtuple( 241 'ParseResult', 'scheme netloc path params query fragment') 242 243_DefragResultBase.__doc__ = """ 244DefragResult(url, fragment) 245 246A 2-tuple that contains the url without fragment identifier and the fragment 247identifier as a separate argument. 248""" 249 250_DefragResultBase.url.__doc__ = """The URL with no fragment identifier.""" 251 252_DefragResultBase.fragment.__doc__ = """ 253Fragment identifier separated from URL, that allows indirect identification of a 254secondary resource by reference to a primary resource and additional identifying 255information. 256""" 257 258_SplitResultBase.__doc__ = """ 259SplitResult(scheme, netloc, path, query, fragment) 260 261A 5-tuple that contains the different components of a URL. Similar to 262ParseResult, but does not split params. 263""" 264 265_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request.""" 266 267_SplitResultBase.netloc.__doc__ = """ 268Network location where the request is made to. 269""" 270 271_SplitResultBase.path.__doc__ = """ 272The hierarchical path, such as the path to a file to download. 273""" 274 275_SplitResultBase.query.__doc__ = """ 276The query component, that contains non-hierarchical data, that along with data 277in path component, identifies a resource in the scope of URI's scheme and 278network location. 279""" 280 281_SplitResultBase.fragment.__doc__ = """ 282Fragment identifier, that allows indirect identification of a secondary resource 283by reference to a primary resource and additional identifying information. 284""" 285 286_ParseResultBase.__doc__ = """ 287ParseResult(scheme, netloc, path, params, query, fragment) 288 289A 6-tuple that contains components of a parsed URL. 290""" 291 292_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__ 293_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__ 294_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__ 295_ParseResultBase.params.__doc__ = """ 296Parameters for last path element used to dereference the URI in order to provide 297access to perform some operation on the resource. 298""" 299 300_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__ 301_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__ 302 303 304# For backwards compatibility, alias _NetlocResultMixinStr 305# ResultBase is no longer part of the documented API, but it is 306# retained since deprecating it isn't worth the hassle 307ResultBase = _NetlocResultMixinStr 308 309# Structured result objects for string data 310class DefragResult(_DefragResultBase, _ResultMixinStr): 311 __slots__ = () 312 def geturl(self): 313 if self.fragment: 314 return self.url + '#' + self.fragment 315 else: 316 return self.url 317 318class SplitResult(_SplitResultBase, _NetlocResultMixinStr): 319 __slots__ = () 320 def geturl(self): 321 return urlunsplit(self) 322 323class ParseResult(_ParseResultBase, _NetlocResultMixinStr): 324 __slots__ = () 325 def geturl(self): 326 return urlunparse(self) 327 328# Structured result objects for bytes data 329class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): 330 __slots__ = () 331 def geturl(self): 332 if self.fragment: 333 return self.url + b'#' + self.fragment 334 else: 335 return self.url 336 337class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): 338 __slots__ = () 339 def geturl(self): 340 return urlunsplit(self) 341 342class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): 343 __slots__ = () 344 def geturl(self): 345 return urlunparse(self) 346 347# Set up the encode/decode result pairs 348def _fix_result_transcoding(): 349 _result_pairs = ( 350 (DefragResult, DefragResultBytes), 351 (SplitResult, SplitResultBytes), 352 (ParseResult, ParseResultBytes), 353 ) 354 for _decoded, _encoded in _result_pairs: 355 _decoded._encoded_counterpart = _encoded 356 _encoded._decoded_counterpart = _decoded 357 358_fix_result_transcoding() 359del _fix_result_transcoding 360 361def urlparse(url, scheme='', allow_fragments=True): 362 """Parse a URL into 6 components: 363 <scheme>://<netloc>/<path>;<params>?<query>#<fragment> 364 Return a 6-tuple: (scheme, netloc, path, params, query, fragment). 365 Note that we don't break the components up in smaller bits 366 (e.g. netloc is a single string) and we don't expand % escapes.""" 367 url, scheme, _coerce_result = _coerce_args(url, scheme) 368 splitresult = urlsplit(url, scheme, allow_fragments) 369 scheme, netloc, url, query, fragment = splitresult 370 if scheme in uses_params and ';' in url: 371 url, params = _splitparams(url) 372 else: 373 params = '' 374 result = ParseResult(scheme, netloc, url, params, query, fragment) 375 return _coerce_result(result) 376 377def _splitparams(url): 378 if '/' in url: 379 i = url.find(';', url.rfind('/')) 380 if i < 0: 381 return url, '' 382 else: 383 i = url.find(';') 384 return url[:i], url[i+1:] 385 386def _splitnetloc(url, start=0): 387 delim = len(url) # position of end of domain part of url, default is end 388 for c in '/?#': # look for delimiters; the order is NOT important 389 wdelim = url.find(c, start) # find first of this delim 390 if wdelim >= 0: # if found 391 delim = min(delim, wdelim) # use earliest delim position 392 return url[start:delim], url[delim:] # return (domain, rest) 393 394def _checknetloc(netloc): 395 if not netloc or netloc.isascii(): 396 return 397 # looking for characters like \u2100 that expand to 'a/c' 398 # IDNA uses NFKC equivalence, so normalize for this check 399 import unicodedata 400 netloc2 = unicodedata.normalize('NFKC', netloc) 401 if netloc == netloc2: 402 return 403 _, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay 404 for c in '/?#@:': 405 if c in netloc2: 406 raise ValueError("netloc '" + netloc2 + "' contains invalid " + 407 "characters under NFKC normalization") 408 409def urlsplit(url, scheme='', allow_fragments=True): 410 """Parse a URL into 5 components: 411 <scheme>://<netloc>/<path>?<query>#<fragment> 412 Return a 5-tuple: (scheme, netloc, path, query, fragment). 413 Note that we don't break the components up in smaller bits 414 (e.g. netloc is a single string) and we don't expand % escapes.""" 415 url, scheme, _coerce_result = _coerce_args(url, scheme) 416 allow_fragments = bool(allow_fragments) 417 key = url, scheme, allow_fragments, type(url), type(scheme) 418 cached = _parse_cache.get(key, None) 419 if cached: 420 return _coerce_result(cached) 421 if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth 422 clear_cache() 423 netloc = query = fragment = '' 424 i = url.find(':') 425 if i > 0: 426 if url[:i] == 'http': # optimize the common case 427 url = url[i+1:] 428 if url[:2] == '//': 429 netloc, url = _splitnetloc(url, 2) 430 if (('[' in netloc and ']' not in netloc) or 431 (']' in netloc and '[' not in netloc)): 432 raise ValueError("Invalid IPv6 URL") 433 if allow_fragments and '#' in url: 434 url, fragment = url.split('#', 1) 435 if '?' in url: 436 url, query = url.split('?', 1) 437 _checknetloc(netloc) 438 v = SplitResult('http', netloc, url, query, fragment) 439 _parse_cache[key] = v 440 return _coerce_result(v) 441 for c in url[:i]: 442 if c not in scheme_chars: 443 break 444 else: 445 # make sure "url" is not actually a port number (in which case 446 # "scheme" is really part of the path) 447 rest = url[i+1:] 448 if not rest or any(c not in '0123456789' for c in rest): 449 # not a port number 450 scheme, url = url[:i].lower(), rest 451 452 if url[:2] == '//': 453 netloc, url = _splitnetloc(url, 2) 454 if (('[' in netloc and ']' not in netloc) or 455 (']' in netloc and '[' not in netloc)): 456 raise ValueError("Invalid IPv6 URL") 457 if allow_fragments and '#' in url: 458 url, fragment = url.split('#', 1) 459 if '?' in url: 460 url, query = url.split('?', 1) 461 _checknetloc(netloc) 462 v = SplitResult(scheme, netloc, url, query, fragment) 463 _parse_cache[key] = v 464 return _coerce_result(v) 465 466def urlunparse(components): 467 """Put a parsed URL back together again. This may result in a 468 slightly different, but equivalent URL, if the URL that was parsed 469 originally had redundant delimiters, e.g. a ? with an empty query 470 (the draft states that these are equivalent).""" 471 scheme, netloc, url, params, query, fragment, _coerce_result = ( 472 _coerce_args(*components)) 473 if params: 474 url = "%s;%s" % (url, params) 475 return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) 476 477def urlunsplit(components): 478 """Combine the elements of a tuple as returned by urlsplit() into a 479 complete URL as a string. The data argument can be any five-item iterable. 480 This may result in a slightly different, but equivalent URL, if the URL that 481 was parsed originally had unnecessary delimiters (for example, a ? with an 482 empty query; the RFC states that these are equivalent).""" 483 scheme, netloc, url, query, fragment, _coerce_result = ( 484 _coerce_args(*components)) 485 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): 486 if url and url[:1] != '/': url = '/' + url 487 url = '//' + (netloc or '') + url 488 if scheme: 489 url = scheme + ':' + url 490 if query: 491 url = url + '?' + query 492 if fragment: 493 url = url + '#' + fragment 494 return _coerce_result(url) 495 496def urljoin(base, url, allow_fragments=True): 497 """Join a base URL and a possibly relative URL to form an absolute 498 interpretation of the latter.""" 499 if not base: 500 return url 501 if not url: 502 return base 503 504 base, url, _coerce_result = _coerce_args(base, url) 505 bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ 506 urlparse(base, '', allow_fragments) 507 scheme, netloc, path, params, query, fragment = \ 508 urlparse(url, bscheme, allow_fragments) 509 510 if scheme != bscheme or scheme not in uses_relative: 511 return _coerce_result(url) 512 if scheme in uses_netloc: 513 if netloc: 514 return _coerce_result(urlunparse((scheme, netloc, path, 515 params, query, fragment))) 516 netloc = bnetloc 517 518 if not path and not params: 519 path = bpath 520 params = bparams 521 if not query: 522 query = bquery 523 return _coerce_result(urlunparse((scheme, netloc, path, 524 params, query, fragment))) 525 526 base_parts = bpath.split('/') 527 if base_parts[-1] != '': 528 # the last item is not a directory, so will not be taken into account 529 # in resolving the relative path 530 del base_parts[-1] 531 532 # for rfc3986, ignore all base path should the first character be root. 533 if path[:1] == '/': 534 segments = path.split('/') 535 else: 536 segments = base_parts + path.split('/') 537 # filter out elements that would cause redundant slashes on re-joining 538 # the resolved_path 539 segments[1:-1] = filter(None, segments[1:-1]) 540 541 resolved_path = [] 542 543 for seg in segments: 544 if seg == '..': 545 try: 546 resolved_path.pop() 547 except IndexError: 548 # ignore any .. segments that would otherwise cause an IndexError 549 # when popped from resolved_path if resolving for rfc3986 550 pass 551 elif seg == '.': 552 continue 553 else: 554 resolved_path.append(seg) 555 556 if segments[-1] in ('.', '..'): 557 # do some post-processing here. if the last segment was a relative dir, 558 # then we need to append the trailing '/' 559 resolved_path.append('') 560 561 return _coerce_result(urlunparse((scheme, netloc, '/'.join( 562 resolved_path) or '/', params, query, fragment))) 563 564 565def urldefrag(url): 566 """Removes any existing fragment from URL. 567 568 Returns a tuple of the defragmented URL and the fragment. If 569 the URL contained no fragments, the second element is the 570 empty string. 571 """ 572 url, _coerce_result = _coerce_args(url) 573 if '#' in url: 574 s, n, p, a, q, frag = urlparse(url) 575 defrag = urlunparse((s, n, p, a, q, '')) 576 else: 577 frag = '' 578 defrag = url 579 return _coerce_result(DefragResult(defrag, frag)) 580 581_hexdig = '0123456789ABCDEFabcdef' 582_hextobyte = None 583 584def unquote_to_bytes(string): 585 """unquote_to_bytes('abc%20def') -> b'abc def'.""" 586 # Note: strings are encoded as UTF-8. This is only an issue if it contains 587 # unescaped non-ASCII characters, which URIs should not. 588 if not string: 589 # Is it a string-like object? 590 string.split 591 return b'' 592 if isinstance(string, str): 593 string = string.encode('utf-8') 594 bits = string.split(b'%') 595 if len(bits) == 1: 596 return string 597 res = [bits[0]] 598 append = res.append 599 # Delay the initialization of the table to not waste memory 600 # if the function is never called 601 global _hextobyte 602 if _hextobyte is None: 603 _hextobyte = {(a + b).encode(): bytes.fromhex(a + b) 604 for a in _hexdig for b in _hexdig} 605 for item in bits[1:]: 606 try: 607 append(_hextobyte[item[:2]]) 608 append(item[2:]) 609 except KeyError: 610 append(b'%') 611 append(item) 612 return b''.join(res) 613 614_asciire = re.compile('([\x00-\x7f]+)') 615 616def unquote(string, encoding='utf-8', errors='replace'): 617 """Replace %xx escapes by their single-character equivalent. The optional 618 encoding and errors parameters specify how to decode percent-encoded 619 sequences into Unicode characters, as accepted by the bytes.decode() 620 method. 621 By default, percent-encoded sequences are decoded with UTF-8, and invalid 622 sequences are replaced by a placeholder character. 623 624 unquote('abc%20def') -> 'abc def'. 625 """ 626 if '%' not in string: 627 string.split 628 return string 629 if encoding is None: 630 encoding = 'utf-8' 631 if errors is None: 632 errors = 'replace' 633 bits = _asciire.split(string) 634 res = [bits[0]] 635 append = res.append 636 for i in range(1, len(bits), 2): 637 append(unquote_to_bytes(bits[i]).decode(encoding, errors)) 638 append(bits[i + 1]) 639 return ''.join(res) 640 641 642def parse_qs(qs, keep_blank_values=False, strict_parsing=False, 643 encoding='utf-8', errors='replace', max_num_fields=None): 644 """Parse a query given as a string argument. 645 646 Arguments: 647 648 qs: percent-encoded query string to be parsed 649 650 keep_blank_values: flag indicating whether blank values in 651 percent-encoded queries should be treated as blank strings. 652 A true value indicates that blanks should be retained as 653 blank strings. The default false value indicates that 654 blank values are to be ignored and treated as if they were 655 not included. 656 657 strict_parsing: flag indicating what to do with parsing errors. 658 If false (the default), errors are silently ignored. 659 If true, errors raise a ValueError exception. 660 661 encoding and errors: specify how to decode percent-encoded sequences 662 into Unicode characters, as accepted by the bytes.decode() method. 663 664 max_num_fields: int. If set, then throws a ValueError if there 665 are more than n fields read by parse_qsl(). 666 667 Returns a dictionary. 668 """ 669 parsed_result = {} 670 pairs = parse_qsl(qs, keep_blank_values, strict_parsing, 671 encoding=encoding, errors=errors, 672 max_num_fields=max_num_fields) 673 for name, value in pairs: 674 if name in parsed_result: 675 parsed_result[name].append(value) 676 else: 677 parsed_result[name] = [value] 678 return parsed_result 679 680 681def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, 682 encoding='utf-8', errors='replace', max_num_fields=None): 683 """Parse a query given as a string argument. 684 685 Arguments: 686 687 qs: percent-encoded query string to be parsed 688 689 keep_blank_values: flag indicating whether blank values in 690 percent-encoded queries should be treated as blank strings. 691 A true value indicates that blanks should be retained as blank 692 strings. The default false value indicates that blank values 693 are to be ignored and treated as if they were not included. 694 695 strict_parsing: flag indicating what to do with parsing errors. If 696 false (the default), errors are silently ignored. If true, 697 errors raise a ValueError exception. 698 699 encoding and errors: specify how to decode percent-encoded sequences 700 into Unicode characters, as accepted by the bytes.decode() method. 701 702 max_num_fields: int. If set, then throws a ValueError 703 if there are more than n fields read by parse_qsl(). 704 705 Returns a list, as G-d intended. 706 """ 707 qs, _coerce_result = _coerce_args(qs) 708 709 # If max_num_fields is defined then check that the number of fields 710 # is less than max_num_fields. This prevents a memory exhaustion DOS 711 # attack via post bodies with many fields. 712 if max_num_fields is not None: 713 num_fields = 1 + qs.count('&') + qs.count(';') 714 if max_num_fields < num_fields: 715 raise ValueError('Max number of fields exceeded') 716 717 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] 718 r = [] 719 for name_value in pairs: 720 if not name_value and not strict_parsing: 721 continue 722 nv = name_value.split('=', 1) 723 if len(nv) != 2: 724 if strict_parsing: 725 raise ValueError("bad query field: %r" % (name_value,)) 726 # Handle case of a control-name with no equal sign 727 if keep_blank_values: 728 nv.append('') 729 else: 730 continue 731 if len(nv[1]) or keep_blank_values: 732 name = nv[0].replace('+', ' ') 733 name = unquote(name, encoding=encoding, errors=errors) 734 name = _coerce_result(name) 735 value = nv[1].replace('+', ' ') 736 value = unquote(value, encoding=encoding, errors=errors) 737 value = _coerce_result(value) 738 r.append((name, value)) 739 return r 740 741def unquote_plus(string, encoding='utf-8', errors='replace'): 742 """Like unquote(), but also replace plus signs by spaces, as required for 743 unquoting HTML form values. 744 745 unquote_plus('%7e/abc+def') -> '~/abc def' 746 """ 747 string = string.replace('+', ' ') 748 return unquote(string, encoding, errors) 749 750_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 751 b'abcdefghijklmnopqrstuvwxyz' 752 b'0123456789' 753 b'_.-~') 754_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) 755_safe_quoters = {} 756 757class Quoter(collections.defaultdict): 758 """A mapping from bytes (in range(0,256)) to strings. 759 760 String values are percent-encoded byte values, unless the key < 128, and 761 in the "safe" set (either the specified safe set, or default set). 762 """ 763 # Keeps a cache internally, using defaultdict, for efficiency (lookups 764 # of cached keys don't call Python code at all). 765 def __init__(self, safe): 766 """safe: bytes object.""" 767 self.safe = _ALWAYS_SAFE.union(safe) 768 769 def __repr__(self): 770 # Without this, will just display as a defaultdict 771 return "<%s %r>" % (self.__class__.__name__, dict(self)) 772 773 def __missing__(self, b): 774 # Handle a cache miss. Store quoted string in cache and return. 775 res = chr(b) if b in self.safe else '%{:02X}'.format(b) 776 self[b] = res 777 return res 778 779def quote(string, safe='/', encoding=None, errors=None): 780 """quote('abc def') -> 'abc%20def' 781 782 Each part of a URL, e.g. the path info, the query, etc., has a 783 different set of reserved characters that must be quoted. 784 785 RFC 3986 Uniform Resource Identifiers (URI): Generic Syntax lists 786 the following reserved characters. 787 788 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 789 "$" | "," | "~" 790 791 Each of these characters is reserved in some component of a URL, 792 but not necessarily in all of them. 793 794 Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings. 795 Now, "~" is included in the set of reserved characters. 796 797 By default, the quote function is intended for quoting the path 798 section of a URL. Thus, it will not encode '/'. This character 799 is reserved, but in typical usage the quote function is being 800 called on a path where the existing slash characters are used as 801 reserved characters. 802 803 string and safe may be either str or bytes objects. encoding and errors 804 must not be specified if string is a bytes object. 805 806 The optional encoding and errors parameters specify how to deal with 807 non-ASCII characters, as accepted by the str.encode method. 808 By default, encoding='utf-8' (characters are encoded with UTF-8), and 809 errors='strict' (unsupported characters raise a UnicodeEncodeError). 810 """ 811 if isinstance(string, str): 812 if not string: 813 return string 814 if encoding is None: 815 encoding = 'utf-8' 816 if errors is None: 817 errors = 'strict' 818 string = string.encode(encoding, errors) 819 else: 820 if encoding is not None: 821 raise TypeError("quote() doesn't support 'encoding' for bytes") 822 if errors is not None: 823 raise TypeError("quote() doesn't support 'errors' for bytes") 824 return quote_from_bytes(string, safe) 825 826def quote_plus(string, safe='', encoding=None, errors=None): 827 """Like quote(), but also replace ' ' with '+', as required for quoting 828 HTML form values. Plus signs in the original string are escaped unless 829 they are included in safe. It also does not have safe default to '/'. 830 """ 831 # Check if ' ' in string, where string may either be a str or bytes. If 832 # there are no spaces, the regular quote will produce the right answer. 833 if ((isinstance(string, str) and ' ' not in string) or 834 (isinstance(string, bytes) and b' ' not in string)): 835 return quote(string, safe, encoding, errors) 836 if isinstance(safe, str): 837 space = ' ' 838 else: 839 space = b' ' 840 string = quote(string, safe + space, encoding, errors) 841 return string.replace(' ', '+') 842 843def quote_from_bytes(bs, safe='/'): 844 """Like quote(), but accepts a bytes object rather than a str, and does 845 not perform string-to-bytes encoding. It always returns an ASCII string. 846 quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f' 847 """ 848 if not isinstance(bs, (bytes, bytearray)): 849 raise TypeError("quote_from_bytes() expected bytes") 850 if not bs: 851 return '' 852 if isinstance(safe, str): 853 # Normalize 'safe' by converting to bytes and removing non-ASCII chars 854 safe = safe.encode('ascii', 'ignore') 855 else: 856 safe = bytes([c for c in safe if c < 128]) 857 if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe): 858 return bs.decode() 859 try: 860 quoter = _safe_quoters[safe] 861 except KeyError: 862 _safe_quoters[safe] = quoter = Quoter(safe).__getitem__ 863 return ''.join([quoter(char) for char in bs]) 864 865def urlencode(query, doseq=False, safe='', encoding=None, errors=None, 866 quote_via=quote_plus): 867 """Encode a dict or sequence of two-element tuples into a URL query string. 868 869 If any values in the query arg are sequences and doseq is true, each 870 sequence element is converted to a separate parameter. 871 872 If the query arg is a sequence of two-element tuples, the order of the 873 parameters in the output will match the order of parameters in the 874 input. 875 876 The components of a query arg may each be either a string or a bytes type. 877 878 The safe, encoding, and errors parameters are passed down to the function 879 specified by quote_via (encoding and errors only if a component is a str). 880 """ 881 882 if hasattr(query, "items"): 883 query = query.items() 884 else: 885 # It's a bother at times that strings and string-like objects are 886 # sequences. 887 try: 888 # non-sequence items should not work with len() 889 # non-empty strings will fail this 890 if len(query) and not isinstance(query[0], tuple): 891 raise TypeError 892 # Zero-length sequences of all types will get here and succeed, 893 # but that's a minor nit. Since the original implementation 894 # allowed empty dicts that type of behavior probably should be 895 # preserved for consistency 896 except TypeError: 897 ty, va, tb = sys.exc_info() 898 raise TypeError("not a valid non-string sequence " 899 "or mapping object").with_traceback(tb) 900 901 l = [] 902 if not doseq: 903 for k, v in query: 904 if isinstance(k, bytes): 905 k = quote_via(k, safe) 906 else: 907 k = quote_via(str(k), safe, encoding, errors) 908 909 if isinstance(v, bytes): 910 v = quote_via(v, safe) 911 else: 912 v = quote_via(str(v), safe, encoding, errors) 913 l.append(k + '=' + v) 914 else: 915 for k, v in query: 916 if isinstance(k, bytes): 917 k = quote_via(k, safe) 918 else: 919 k = quote_via(str(k), safe, encoding, errors) 920 921 if isinstance(v, bytes): 922 v = quote_via(v, safe) 923 l.append(k + '=' + v) 924 elif isinstance(v, str): 925 v = quote_via(v, safe, encoding, errors) 926 l.append(k + '=' + v) 927 else: 928 try: 929 # Is this a sufficient test for sequence-ness? 930 x = len(v) 931 except TypeError: 932 # not a sequence 933 v = quote_via(str(v), safe, encoding, errors) 934 l.append(k + '=' + v) 935 else: 936 # loop over the sequence 937 for elt in v: 938 if isinstance(elt, bytes): 939 elt = quote_via(elt, safe) 940 else: 941 elt = quote_via(str(elt), safe, encoding, errors) 942 l.append(k + '=' + elt) 943 return '&'.join(l) 944 945def to_bytes(url): 946 """to_bytes(u"URL") --> 'URL'.""" 947 # Most URL schemes require ASCII. If that changes, the conversion 948 # can be relaxed. 949 # XXX get rid of to_bytes() 950 if isinstance(url, str): 951 try: 952 url = url.encode("ASCII").decode() 953 except UnicodeError: 954 raise UnicodeError("URL " + repr(url) + 955 " contains non-ASCII characters") 956 return url 957 958def unwrap(url): 959 """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" 960 url = str(url).strip() 961 if url[:1] == '<' and url[-1:] == '>': 962 url = url[1:-1].strip() 963 if url[:4] == 'URL:': url = url[4:].strip() 964 return url 965 966_typeprog = None 967def splittype(url): 968 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 969 global _typeprog 970 if _typeprog is None: 971 _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) 972 973 match = _typeprog.match(url) 974 if match: 975 scheme, data = match.groups() 976 return scheme.lower(), data 977 return None, url 978 979_hostprog = None 980def splithost(url): 981 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 982 global _hostprog 983 if _hostprog is None: 984 _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) 985 986 match = _hostprog.match(url) 987 if match: 988 host_port, path = match.groups() 989 if path and path[0] != '/': 990 path = '/' + path 991 return host_port, path 992 return None, url 993 994def splituser(host): 995 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 996 user, delim, host = host.rpartition('@') 997 return (user if delim else None), host 998 999def splitpasswd(user): 1000 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 1001 user, delim, passwd = user.partition(':') 1002 return user, (passwd if delim else None) 1003 1004# splittag('/path#tag') --> '/path', 'tag' 1005_portprog = None 1006def splitport(host): 1007 """splitport('host:port') --> 'host', 'port'.""" 1008 global _portprog 1009 if _portprog is None: 1010 _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL) 1011 1012 match = _portprog.match(host) 1013 if match: 1014 host, port = match.groups() 1015 if port: 1016 return host, port 1017 return host, None 1018 1019def splitnport(host, defport=-1): 1020 """Split host and port, returning numeric port. 1021 Return given default port if no ':' found; defaults to -1. 1022 Return numerical port if a valid number are found after ':'. 1023 Return None if ':' but not a valid number.""" 1024 host, delim, port = host.rpartition(':') 1025 if not delim: 1026 host = port 1027 elif port: 1028 try: 1029 nport = int(port) 1030 except ValueError: 1031 nport = None 1032 return host, nport 1033 return host, defport 1034 1035def splitquery(url): 1036 """splitquery('/path?query') --> '/path', 'query'.""" 1037 path, delim, query = url.rpartition('?') 1038 if delim: 1039 return path, query 1040 return url, None 1041 1042def splittag(url): 1043 """splittag('/path#tag') --> '/path', 'tag'.""" 1044 path, delim, tag = url.rpartition('#') 1045 if delim: 1046 return path, tag 1047 return url, None 1048 1049def splitattr(url): 1050 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1051 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1052 words = url.split(';') 1053 return words[0], words[1:] 1054 1055def splitvalue(attr): 1056 """splitvalue('attr=value') --> 'attr', 'value'.""" 1057 attr, delim, value = attr.partition('=') 1058 return attr, (value if delim else None) 1059