"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden,
B. Carpenter and L. Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme" by P. Hoffman, L. Masinter, J. Zawinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators" by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""

import re
import sys
import collections

# Public API of the module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']
# Schemes for which urlunsplit() emits a '//' authority part and for which
# urljoin() inherits the base network location.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']
# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    """Return obj unchanged (result coercion used for str inputs)."""
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                        errors=_implicit_errors):
    """Encode a result to bytes (result coercion used for non-str inputs).

    obj only needs an encode(encoding, errors) method.
    """
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                       errors=_implicit_errors):
    """Return a tuple with every item of args decoded to str.

    Falsy items (empty bytes, '' or None) are mapped to '' without being
    decoded.
    """
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    """Normalize the arguments to str and pick the matching result coercion.

    Returns the (possibly decoded) arguments followed by one extra item: the
    function to apply to the result before handing it back to the caller.
    That function is _noop when the first argument is a str, and
    _encode_result otherwise (the inputs are then decoded with the implicit
    ASCII/strict settings).

    Raises TypeError if a non-empty argument disagrees with the first
    argument about being a str.
    """
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #   - noop for str inputs
    #   - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)
116 117# Result objects are more helpful than simple tuples 118class _ResultMixinStr(object): 119 """Standard approach to encoding parsed results from str to bytes""" 120 __slots__ = () 121 122 def encode(self, encoding='ascii', errors='strict'): 123 return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self)) 124 125 126class _ResultMixinBytes(object): 127 """Standard approach to decoding parsed results from bytes to str""" 128 __slots__ = () 129 130 def decode(self, encoding='ascii', errors='strict'): 131 return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self)) 132 133 134class _NetlocResultMixinBase(object): 135 """Shared methods for the parsed result objects containing a netloc element""" 136 __slots__ = () 137 138 @property 139 def username(self): 140 return self._userinfo[0] 141 142 @property 143 def password(self): 144 return self._userinfo[1] 145 146 @property 147 def hostname(self): 148 hostname = self._hostinfo[0] 149 if not hostname: 150 hostname = None 151 elif hostname is not None: 152 hostname = hostname.lower() 153 return hostname 154 155 @property 156 def port(self): 157 port = self._hostinfo[1] 158 if port is not None: 159 port = int(port, 10) 160 if not ( 0 <= port <= 65535): 161 raise ValueError("Port out of range 0-65535") 162 return port 163 164 165class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr): 166 __slots__ = () 167 168 @property 169 def _userinfo(self): 170 netloc = self.netloc 171 userinfo, have_info, hostinfo = netloc.rpartition('@') 172 if have_info: 173 username, have_password, password = userinfo.partition(':') 174 if not have_password: 175 password = None 176 else: 177 username = password = None 178 return username, password 179 180 @property 181 def _hostinfo(self): 182 netloc = self.netloc 183 _, _, hostinfo = netloc.rpartition('@') 184 _, have_open_br, bracketed = hostinfo.partition('[') 185 if have_open_br: 186 hostname, _, port = bracketed.partition(']') 187 _, _, port = 
port.partition(':') 188 else: 189 hostname, _, port = hostinfo.partition(':') 190 if not port: 191 port = None 192 return hostname, port 193 194 195class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): 196 __slots__ = () 197 198 @property 199 def _userinfo(self): 200 netloc = self.netloc 201 userinfo, have_info, hostinfo = netloc.rpartition(b'@') 202 if have_info: 203 username, have_password, password = userinfo.partition(b':') 204 if not have_password: 205 password = None 206 else: 207 username = password = None 208 return username, password 209 210 @property 211 def _hostinfo(self): 212 netloc = self.netloc 213 _, _, hostinfo = netloc.rpartition(b'@') 214 _, have_open_br, bracketed = hostinfo.partition(b'[') 215 if have_open_br: 216 hostname, _, port = bracketed.partition(b']') 217 _, _, port = port.partition(b':') 218 else: 219 hostname, _, port = hostinfo.partition(b':') 220 if not port: 221 port = None 222 return hostname, port 223 224 225from collections import namedtuple 226 227_DefragResultBase = namedtuple('DefragResult', 'url fragment') 228_SplitResultBase = namedtuple( 229 'SplitResult', 'scheme netloc path query fragment') 230_ParseResultBase = namedtuple( 231 'ParseResult', 'scheme netloc path params query fragment') 232 233_DefragResultBase.__doc__ = """ 234DefragResult(url, fragment) 235 236A 2-tuple that contains the url without fragment identifier and the fragment 237identifier as a separate argument. 238""" 239 240_DefragResultBase.url.__doc__ = """The URL with no fragment identifier.""" 241 242_DefragResultBase.fragment.__doc__ = """ 243Fragment identifier separated from URL, that allows indirect identification of a 244secondary resource by reference to a primary resource and additional identifying 245information. 246""" 247 248_SplitResultBase.__doc__ = """ 249SplitResult(scheme, netloc, path, query, fragment) 250 251A 5-tuple that contains the different components of a URL. 
Similar to 252ParseResult, but does not split params. 253""" 254 255_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request.""" 256 257_SplitResultBase.netloc.__doc__ = """ 258Network location where the request is made to. 259""" 260 261_SplitResultBase.path.__doc__ = """ 262The hierarchical path, such as the path to a file to download. 263""" 264 265_SplitResultBase.query.__doc__ = """ 266The query component, that contains non-hierarchical data, that along with data 267in path component, identifies a resource in the scope of URI's scheme and 268network location. 269""" 270 271_SplitResultBase.fragment.__doc__ = """ 272Fragment identifier, that allows indirect identification of a secondary resource 273by reference to a primary resource and additional identifying information. 274""" 275 276_ParseResultBase.__doc__ = """ 277ParseResult(scheme, netloc, path, params, query, fragment) 278 279A 6-tuple that contains components of a parsed URL. 280""" 281 282_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__ 283_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__ 284_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__ 285_ParseResultBase.params.__doc__ = """ 286Parameters for last path element used to dereference the URI in order to provide 287access to perform some operation on the resource. 
288""" 289 290_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__ 291_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__ 292 293 294# For backwards compatibility, alias _NetlocResultMixinStr 295# ResultBase is no longer part of the documented API, but it is 296# retained since deprecating it isn't worth the hassle 297ResultBase = _NetlocResultMixinStr 298 299# Structured result objects for string data 300class DefragResult(_DefragResultBase, _ResultMixinStr): 301 __slots__ = () 302 def geturl(self): 303 if self.fragment: 304 return self.url + '#' + self.fragment 305 else: 306 return self.url 307 308class SplitResult(_SplitResultBase, _NetlocResultMixinStr): 309 __slots__ = () 310 def geturl(self): 311 return urlunsplit(self) 312 313class ParseResult(_ParseResultBase, _NetlocResultMixinStr): 314 __slots__ = () 315 def geturl(self): 316 return urlunparse(self) 317 318# Structured result objects for bytes data 319class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): 320 __slots__ = () 321 def geturl(self): 322 if self.fragment: 323 return self.url + b'#' + self.fragment 324 else: 325 return self.url 326 327class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): 328 __slots__ = () 329 def geturl(self): 330 return urlunsplit(self) 331 332class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): 333 __slots__ = () 334 def geturl(self): 335 return urlunparse(self) 336 337# Set up the encode/decode result pairs 338def _fix_result_transcoding(): 339 _result_pairs = ( 340 (DefragResult, DefragResultBytes), 341 (SplitResult, SplitResultBytes), 342 (ParseResult, ParseResultBytes), 343 ) 344 for _decoded, _encoded in _result_pairs: 345 _decoded._encoded_counterpart = _encoded 346 _encoded._decoded_counterpart = _decoded 347 348_fix_result_transcoding() 349del _fix_result_transcoding 350 351def urlparse(url, scheme='', allow_fragments=True): 352 """Parse a URL into 6 components: 353 
<scheme>://<netloc>/<path>;<params>?<query>#<fragment> 354 Return a 6-tuple: (scheme, netloc, path, params, query, fragment). 355 Note that we don't break the components up in smaller bits 356 (e.g. netloc is a single string) and we don't expand % escapes.""" 357 url, scheme, _coerce_result = _coerce_args(url, scheme) 358 splitresult = urlsplit(url, scheme, allow_fragments) 359 scheme, netloc, url, query, fragment = splitresult 360 if scheme in uses_params and ';' in url: 361 url, params = _splitparams(url) 362 else: 363 params = '' 364 result = ParseResult(scheme, netloc, url, params, query, fragment) 365 return _coerce_result(result) 366 367def _splitparams(url): 368 if '/' in url: 369 i = url.find(';', url.rfind('/')) 370 if i < 0: 371 return url, '' 372 else: 373 i = url.find(';') 374 return url[:i], url[i+1:] 375 376def _splitnetloc(url, start=0): 377 delim = len(url) # position of end of domain part of url, default is end 378 for c in '/?#': # look for delimiters; the order is NOT important 379 wdelim = url.find(c, start) # find first of this delim 380 if wdelim >= 0: # if found 381 delim = min(delim, wdelim) # use earliest delim position 382 return url[start:delim], url[delim:] # return (domain, rest) 383 384def urlsplit(url, scheme='', allow_fragments=True): 385 """Parse a URL into 5 components: 386 <scheme>://<netloc>/<path>?<query>#<fragment> 387 Return a 5-tuple: (scheme, netloc, path, query, fragment). 388 Note that we don't break the components up in smaller bits 389 (e.g. 
netloc is a single string) and we don't expand % escapes.""" 390 url, scheme, _coerce_result = _coerce_args(url, scheme) 391 allow_fragments = bool(allow_fragments) 392 key = url, scheme, allow_fragments, type(url), type(scheme) 393 cached = _parse_cache.get(key, None) 394 if cached: 395 return _coerce_result(cached) 396 if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth 397 clear_cache() 398 netloc = query = fragment = '' 399 i = url.find(':') 400 if i > 0: 401 if url[:i] == 'http': # optimize the common case 402 scheme = url[:i].lower() 403 url = url[i+1:] 404 if url[:2] == '//': 405 netloc, url = _splitnetloc(url, 2) 406 if (('[' in netloc and ']' not in netloc) or 407 (']' in netloc and '[' not in netloc)): 408 raise ValueError("Invalid IPv6 URL") 409 if allow_fragments and '#' in url: 410 url, fragment = url.split('#', 1) 411 if '?' in url: 412 url, query = url.split('?', 1) 413 v = SplitResult(scheme, netloc, url, query, fragment) 414 _parse_cache[key] = v 415 return _coerce_result(v) 416 for c in url[:i]: 417 if c not in scheme_chars: 418 break 419 else: 420 # make sure "url" is not actually a port number (in which case 421 # "scheme" is really part of the path) 422 rest = url[i+1:] 423 if not rest or any(c not in '0123456789' for c in rest): 424 # not a port number 425 scheme, url = url[:i].lower(), rest 426 427 if url[:2] == '//': 428 netloc, url = _splitnetloc(url, 2) 429 if (('[' in netloc and ']' not in netloc) or 430 (']' in netloc and '[' not in netloc)): 431 raise ValueError("Invalid IPv6 URL") 432 if allow_fragments and '#' in url: 433 url, fragment = url.split('#', 1) 434 if '?' in url: 435 url, query = url.split('?', 1) 436 v = SplitResult(scheme, netloc, url, query, fragment) 437 _parse_cache[key] = v 438 return _coerce_result(v) 439 440def urlunparse(components): 441 """Put a parsed URL back together again. 
This may result in a 442 slightly different, but equivalent URL, if the URL that was parsed 443 originally had redundant delimiters, e.g. a ? with an empty query 444 (the draft states that these are equivalent).""" 445 scheme, netloc, url, params, query, fragment, _coerce_result = ( 446 _coerce_args(*components)) 447 if params: 448 url = "%s;%s" % (url, params) 449 return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) 450 451def urlunsplit(components): 452 """Combine the elements of a tuple as returned by urlsplit() into a 453 complete URL as a string. The data argument can be any five-item iterable. 454 This may result in a slightly different, but equivalent URL, if the URL that 455 was parsed originally had unnecessary delimiters (for example, a ? with an 456 empty query; the RFC states that these are equivalent).""" 457 scheme, netloc, url, query, fragment, _coerce_result = ( 458 _coerce_args(*components)) 459 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): 460 if url and url[:1] != '/': url = '/' + url 461 url = '//' + (netloc or '') + url 462 if scheme: 463 url = scheme + ':' + url 464 if query: 465 url = url + '?' 
+ query 466 if fragment: 467 url = url + '#' + fragment 468 return _coerce_result(url) 469 470def urljoin(base, url, allow_fragments=True): 471 """Join a base URL and a possibly relative URL to form an absolute 472 interpretation of the latter.""" 473 if not base: 474 return url 475 if not url: 476 return base 477 478 base, url, _coerce_result = _coerce_args(base, url) 479 bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ 480 urlparse(base, '', allow_fragments) 481 scheme, netloc, path, params, query, fragment = \ 482 urlparse(url, bscheme, allow_fragments) 483 484 if scheme != bscheme or scheme not in uses_relative: 485 return _coerce_result(url) 486 if scheme in uses_netloc: 487 if netloc: 488 return _coerce_result(urlunparse((scheme, netloc, path, 489 params, query, fragment))) 490 netloc = bnetloc 491 492 if not path and not params: 493 path = bpath 494 params = bparams 495 if not query: 496 query = bquery 497 return _coerce_result(urlunparse((scheme, netloc, path, 498 params, query, fragment))) 499 500 base_parts = bpath.split('/') 501 if base_parts[-1] != '': 502 # the last item is not a directory, so will not be taken into account 503 # in resolving the relative path 504 del base_parts[-1] 505 506 # for rfc3986, ignore all base path should the first character be root. 507 if path[:1] == '/': 508 segments = path.split('/') 509 else: 510 segments = base_parts + path.split('/') 511 # filter out elements that would cause redundant slashes on re-joining 512 # the resolved_path 513 segments[1:-1] = filter(None, segments[1:-1]) 514 515 resolved_path = [] 516 517 for seg in segments: 518 if seg == '..': 519 try: 520 resolved_path.pop() 521 except IndexError: 522 # ignore any .. segments that would otherwise cause an IndexError 523 # when popped from resolved_path if resolving for rfc3986 524 pass 525 elif seg == '.': 526 continue 527 else: 528 resolved_path.append(seg) 529 530 if segments[-1] in ('.', '..'): 531 # do some post-processing here. 
if the last segment was a relative dir, 532 # then we need to append the trailing '/' 533 resolved_path.append('') 534 535 return _coerce_result(urlunparse((scheme, netloc, '/'.join( 536 resolved_path) or '/', params, query, fragment))) 537 538 539def urldefrag(url): 540 """Removes any existing fragment from URL. 541 542 Returns a tuple of the defragmented URL and the fragment. If 543 the URL contained no fragments, the second element is the 544 empty string. 545 """ 546 url, _coerce_result = _coerce_args(url) 547 if '#' in url: 548 s, n, p, a, q, frag = urlparse(url) 549 defrag = urlunparse((s, n, p, a, q, '')) 550 else: 551 frag = '' 552 defrag = url 553 return _coerce_result(DefragResult(defrag, frag)) 554 555_hexdig = '0123456789ABCDEFabcdef' 556_hextobyte = None 557 558def unquote_to_bytes(string): 559 """unquote_to_bytes('abc%20def') -> b'abc def'.""" 560 # Note: strings are encoded as UTF-8. This is only an issue if it contains 561 # unescaped non-ASCII characters, which URIs should not. 562 if not string: 563 # Is it a string-like object? 564 string.split 565 return b'' 566 if isinstance(string, str): 567 string = string.encode('utf-8') 568 bits = string.split(b'%') 569 if len(bits) == 1: 570 return string 571 res = [bits[0]] 572 append = res.append 573 # Delay the initialization of the table to not waste memory 574 # if the function is never called 575 global _hextobyte 576 if _hextobyte is None: 577 _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)]) 578 for a in _hexdig for b in _hexdig} 579 for item in bits[1:]: 580 try: 581 append(_hextobyte[item[:2]]) 582 append(item[2:]) 583 except KeyError: 584 append(b'%') 585 append(item) 586 return b''.join(res) 587 588_asciire = re.compile('([\x00-\x7f]+)') 589 590def unquote(string, encoding='utf-8', errors='replace'): 591 """Replace %xx escapes by their single-character equivalent. 
The optional 592 encoding and errors parameters specify how to decode percent-encoded 593 sequences into Unicode characters, as accepted by the bytes.decode() 594 method. 595 By default, percent-encoded sequences are decoded with UTF-8, and invalid 596 sequences are replaced by a placeholder character. 597 598 unquote('abc%20def') -> 'abc def'. 599 """ 600 if '%' not in string: 601 string.split 602 return string 603 if encoding is None: 604 encoding = 'utf-8' 605 if errors is None: 606 errors = 'replace' 607 bits = _asciire.split(string) 608 res = [bits[0]] 609 append = res.append 610 for i in range(1, len(bits), 2): 611 append(unquote_to_bytes(bits[i]).decode(encoding, errors)) 612 append(bits[i + 1]) 613 return ''.join(res) 614 615def parse_qs(qs, keep_blank_values=False, strict_parsing=False, 616 encoding='utf-8', errors='replace'): 617 """Parse a query given as a string argument. 618 619 Arguments: 620 621 qs: percent-encoded query string to be parsed 622 623 keep_blank_values: flag indicating whether blank values in 624 percent-encoded queries should be treated as blank strings. 625 A true value indicates that blanks should be retained as 626 blank strings. The default false value indicates that 627 blank values are to be ignored and treated as if they were 628 not included. 629 630 strict_parsing: flag indicating what to do with parsing errors. 631 If false (the default), errors are silently ignored. 632 If true, errors raise a ValueError exception. 633 634 encoding and errors: specify how to decode percent-encoded sequences 635 into Unicode characters, as accepted by the bytes.decode() method. 
636 """ 637 parsed_result = {} 638 pairs = parse_qsl(qs, keep_blank_values, strict_parsing, 639 encoding=encoding, errors=errors) 640 for name, value in pairs: 641 if name in parsed_result: 642 parsed_result[name].append(value) 643 else: 644 parsed_result[name] = [value] 645 return parsed_result 646 647def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, 648 encoding='utf-8', errors='replace'): 649 """Parse a query given as a string argument. 650 651 Arguments: 652 653 qs: percent-encoded query string to be parsed 654 655 keep_blank_values: flag indicating whether blank values in 656 percent-encoded queries should be treated as blank strings. A 657 true value indicates that blanks should be retained as blank 658 strings. The default false value indicates that blank values 659 are to be ignored and treated as if they were not included. 660 661 strict_parsing: flag indicating what to do with parsing errors. If 662 false (the default), errors are silently ignored. If true, 663 errors raise a ValueError exception. 664 665 encoding and errors: specify how to decode percent-encoded sequences 666 into Unicode characters, as accepted by the bytes.decode() method. 667 668 Returns a list, as G-d intended. 
669 """ 670 qs, _coerce_result = _coerce_args(qs) 671 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] 672 r = [] 673 for name_value in pairs: 674 if not name_value and not strict_parsing: 675 continue 676 nv = name_value.split('=', 1) 677 if len(nv) != 2: 678 if strict_parsing: 679 raise ValueError("bad query field: %r" % (name_value,)) 680 # Handle case of a control-name with no equal sign 681 if keep_blank_values: 682 nv.append('') 683 else: 684 continue 685 if len(nv[1]) or keep_blank_values: 686 name = nv[0].replace('+', ' ') 687 name = unquote(name, encoding=encoding, errors=errors) 688 name = _coerce_result(name) 689 value = nv[1].replace('+', ' ') 690 value = unquote(value, encoding=encoding, errors=errors) 691 value = _coerce_result(value) 692 r.append((name, value)) 693 return r 694 695def unquote_plus(string, encoding='utf-8', errors='replace'): 696 """Like unquote(), but also replace plus signs by spaces, as required for 697 unquoting HTML form values. 698 699 unquote_plus('%7e/abc+def') -> '~/abc def' 700 """ 701 string = string.replace('+', ' ') 702 return unquote(string, encoding, errors) 703 704_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 705 b'abcdefghijklmnopqrstuvwxyz' 706 b'0123456789' 707 b'_.-') 708_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) 709_safe_quoters = {} 710 711class Quoter(collections.defaultdict): 712 """A mapping from bytes (in range(0,256)) to strings. 713 714 String values are percent-encoded byte values, unless the key < 128, and 715 in the "safe" set (either the specified safe set, or default set). 716 """ 717 # Keeps a cache internally, using defaultdict, for efficiency (lookups 718 # of cached keys don't call Python code at all). 
719 def __init__(self, safe): 720 """safe: bytes object.""" 721 self.safe = _ALWAYS_SAFE.union(safe) 722 723 def __repr__(self): 724 # Without this, will just display as a defaultdict 725 return "<%s %r>" % (self.__class__.__name__, dict(self)) 726 727 def __missing__(self, b): 728 # Handle a cache miss. Store quoted string in cache and return. 729 res = chr(b) if b in self.safe else '%{:02X}'.format(b) 730 self[b] = res 731 return res 732 733def quote(string, safe='/', encoding=None, errors=None): 734 """quote('abc def') -> 'abc%20def' 735 736 Each part of a URL, e.g. the path info, the query, etc., has a 737 different set of reserved characters that must be quoted. 738 739 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists 740 the following reserved characters. 741 742 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 743 "$" | "," 744 745 Each of these characters is reserved in some component of a URL, 746 but not necessarily in all of them. 747 748 By default, the quote function is intended for quoting the path 749 section of a URL. Thus, it will not encode '/'. This character 750 is reserved, but in typical usage the quote function is being 751 called on a path where the existing slash characters are used as 752 reserved characters. 753 754 string and safe may be either str or bytes objects. encoding and errors 755 must not be specified if string is a bytes object. 756 757 The optional encoding and errors parameters specify how to deal with 758 non-ASCII characters, as accepted by the str.encode method. 759 By default, encoding='utf-8' (characters are encoded with UTF-8), and 760 errors='strict' (unsupported characters raise a UnicodeEncodeError). 
761 """ 762 if isinstance(string, str): 763 if not string: 764 return string 765 if encoding is None: 766 encoding = 'utf-8' 767 if errors is None: 768 errors = 'strict' 769 string = string.encode(encoding, errors) 770 else: 771 if encoding is not None: 772 raise TypeError("quote() doesn't support 'encoding' for bytes") 773 if errors is not None: 774 raise TypeError("quote() doesn't support 'errors' for bytes") 775 return quote_from_bytes(string, safe) 776 777def quote_plus(string, safe='', encoding=None, errors=None): 778 """Like quote(), but also replace ' ' with '+', as required for quoting 779 HTML form values. Plus signs in the original string are escaped unless 780 they are included in safe. It also does not have safe default to '/'. 781 """ 782 # Check if ' ' in string, where string may either be a str or bytes. If 783 # there are no spaces, the regular quote will produce the right answer. 784 if ((isinstance(string, str) and ' ' not in string) or 785 (isinstance(string, bytes) and b' ' not in string)): 786 return quote(string, safe, encoding, errors) 787 if isinstance(safe, str): 788 space = ' ' 789 else: 790 space = b' ' 791 string = quote(string, safe + space, encoding, errors) 792 return string.replace(' ', '+') 793 794def quote_from_bytes(bs, safe='/'): 795 """Like quote(), but accepts a bytes object rather than a str, and does 796 not perform string-to-bytes encoding. It always returns an ASCII string. 
797 quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f' 798 """ 799 if not isinstance(bs, (bytes, bytearray)): 800 raise TypeError("quote_from_bytes() expected bytes") 801 if not bs: 802 return '' 803 if isinstance(safe, str): 804 # Normalize 'safe' by converting to bytes and removing non-ASCII chars 805 safe = safe.encode('ascii', 'ignore') 806 else: 807 safe = bytes([c for c in safe if c < 128]) 808 if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe): 809 return bs.decode() 810 try: 811 quoter = _safe_quoters[safe] 812 except KeyError: 813 _safe_quoters[safe] = quoter = Quoter(safe).__getitem__ 814 return ''.join([quoter(char) for char in bs]) 815 816def urlencode(query, doseq=False, safe='', encoding=None, errors=None, 817 quote_via=quote_plus): 818 """Encode a dict or sequence of two-element tuples into a URL query string. 819 820 If any values in the query arg are sequences and doseq is true, each 821 sequence element is converted to a separate parameter. 822 823 If the query arg is a sequence of two-element tuples, the order of the 824 parameters in the output will match the order of parameters in the 825 input. 826 827 The components of a query arg may each be either a string or a bytes type. 828 829 The safe, encoding, and errors parameters are passed down to the function 830 specified by quote_via (encoding and errors only if a component is a str). 831 """ 832 833 if hasattr(query, "items"): 834 query = query.items() 835 else: 836 # It's a bother at times that strings and string-like objects are 837 # sequences. 838 try: 839 # non-sequence items should not work with len() 840 # non-empty strings will fail this 841 if len(query) and not isinstance(query[0], tuple): 842 raise TypeError 843 # Zero-length sequences of all types will get here and succeed, 844 # but that's a minor nit. 
Since the original implementation 845 # allowed empty dicts that type of behavior probably should be 846 # preserved for consistency 847 except TypeError: 848 ty, va, tb = sys.exc_info() 849 raise TypeError("not a valid non-string sequence " 850 "or mapping object").with_traceback(tb) 851 852 l = [] 853 if not doseq: 854 for k, v in query: 855 if isinstance(k, bytes): 856 k = quote_via(k, safe) 857 else: 858 k = quote_via(str(k), safe, encoding, errors) 859 860 if isinstance(v, bytes): 861 v = quote_via(v, safe) 862 else: 863 v = quote_via(str(v), safe, encoding, errors) 864 l.append(k + '=' + v) 865 else: 866 for k, v in query: 867 if isinstance(k, bytes): 868 k = quote_via(k, safe) 869 else: 870 k = quote_via(str(k), safe, encoding, errors) 871 872 if isinstance(v, bytes): 873 v = quote_via(v, safe) 874 l.append(k + '=' + v) 875 elif isinstance(v, str): 876 v = quote_via(v, safe, encoding, errors) 877 l.append(k + '=' + v) 878 else: 879 try: 880 # Is this a sufficient test for sequence-ness? 881 x = len(v) 882 except TypeError: 883 # not a sequence 884 v = quote_via(str(v), safe, encoding, errors) 885 l.append(k + '=' + v) 886 else: 887 # loop over the sequence 888 for elt in v: 889 if isinstance(elt, bytes): 890 elt = quote_via(elt, safe) 891 else: 892 elt = quote_via(str(elt), safe, encoding, errors) 893 l.append(k + '=' + elt) 894 return '&'.join(l) 895 896def to_bytes(url): 897 """to_bytes(u"URL") --> 'URL'.""" 898 # Most URL schemes require ASCII. If that changes, the conversion 899 # can be relaxed. 
900 # XXX get rid of to_bytes() 901 if isinstance(url, str): 902 try: 903 url = url.encode("ASCII").decode() 904 except UnicodeError: 905 raise UnicodeError("URL " + repr(url) + 906 " contains non-ASCII characters") 907 return url 908 909def unwrap(url): 910 """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" 911 url = str(url).strip() 912 if url[:1] == '<' and url[-1:] == '>': 913 url = url[1:-1].strip() 914 if url[:4] == 'URL:': url = url[4:].strip() 915 return url 916 917_typeprog = None 918def splittype(url): 919 """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" 920 global _typeprog 921 if _typeprog is None: 922 _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) 923 924 match = _typeprog.match(url) 925 if match: 926 scheme, data = match.groups() 927 return scheme.lower(), data 928 return None, url 929 930_hostprog = None 931def splithost(url): 932 """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" 933 global _hostprog 934 if _hostprog is None: 935 _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL) 936 937 match = _hostprog.match(url) 938 if match: 939 host_port, path = match.groups() 940 if path and path[0] != '/': 941 path = '/' + path 942 return host_port, path 943 return None, url 944 945def splituser(host): 946 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" 947 user, delim, host = host.rpartition('@') 948 return (user if delim else None), host 949 950def splitpasswd(user): 951 """splitpasswd('user:passwd') -> 'user', 'passwd'.""" 952 user, delim, passwd = user.partition(':') 953 return user, (passwd if delim else None) 954 955# splittag('/path#tag') --> '/path', 'tag' 956_portprog = None 957def splitport(host): 958 """splitport('host:port') --> 'host', 'port'.""" 959 global _portprog 960 if _portprog is None: 961 _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL) 962 963 match = _portprog.match(host) 964 if match: 965 host, port = match.groups() 966 if port: 967 return host, port 
968 return host, None 969 970def splitnport(host, defport=-1): 971 """Split host and port, returning numeric port. 972 Return given default port if no ':' found; defaults to -1. 973 Return numerical port if a valid number are found after ':'. 974 Return None if ':' but not a valid number.""" 975 host, delim, port = host.rpartition(':') 976 if not delim: 977 host = port 978 elif port: 979 try: 980 nport = int(port) 981 except ValueError: 982 nport = None 983 return host, nport 984 return host, defport 985 986def splitquery(url): 987 """splitquery('/path?query') --> '/path', 'query'.""" 988 path, delim, query = url.rpartition('?') 989 if delim: 990 return path, query 991 return url, None 992 993def splittag(url): 994 """splittag('/path#tag') --> '/path', 'tag'.""" 995 path, delim, tag = url.rpartition('#') 996 if delim: 997 return path, tag 998 return url, None 999 1000def splitattr(url): 1001 """splitattr('/path;attr1=value1;attr2=value2;...') -> 1002 '/path', ['attr1=value1', 'attr2=value2', ...].""" 1003 words = url.split(';') 1004 return words[0], words[1:] 1005 1006def splitvalue(attr): 1007 """splitvalue('attr=value') --> 'attr', 'value'.""" 1008 attr, delim, value = attr.partition('=') 1009 return attr, (value if delim else None) 1010