Lines Matching +full:url +full:- +full:parse
1 """Parse (absolute and relative) URLs.
5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
8 RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
12 Berners-Lee, R. Fielding, and L. Masinter, August 1998.
14 RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
29 The WHATWG URL Parser spec should also be considered. We are not compliant with
69 # compatibility. (They are undocumented, but have a public-looking name.)
85 '+-.')
103 # presented, we may relax this by using latin-1
123 # - noop for str inputs
124 # - encoding function otherwise
127 # We special-case the empty string to support the
130 raise TypeError("Cannot mix str and non-str arguments")
184 raise ValueError("Port out of range 0-65535")
250 _DefragResultBase = namedtuple('DefragResult', 'url fragment')
257 DefragResult(url, fragment)
259 A 2-tuple that contains the url without fragment identifier and the fragment
263 _DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
266 Fragment identifier separated from URL, that allows indirect identification of a
274 A 5-tuple that contains the different components of a URL. Similar to
278 _SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
289 The query component, that contains non-hierarchical data, that along with data
302 A 6-tuple that contains components of a parsed URL.
327 return self.url + '#' + self.fragment
329 return self.url
346 return self.url + b'#' + self.fragment
348 return self.url
374 def urlparse(url, scheme='', allow_fragments=True): argument
375 """Parse a URL into 6 components:
378 The result is a named 6-tuple with fields corresponding to the
380 depending on the type of the url parameter.
382 The username, password, hostname, and port sub-components of netloc
386 component when no scheme is found in url.
394 url, scheme, _coerce_result = _coerce_args(url, scheme)
395 splitresult = urlsplit(url, scheme, allow_fragments)
396 scheme, netloc, url, query, fragment = splitresult
397 if scheme in uses_params and ';' in url:
398 url, params = _splitparams(url)
401 result = ParseResult(scheme, netloc, url, params, query, fragment)
404 def _splitparams(url): argument
405 if '/' in url:
406 i = url.find(';', url.rfind('/'))
408 return url, ''
410 i = url.find(';')
411 return url[:i], url[i+1:]
413 def _splitnetloc(url, start=0): argument
414 delim = len(url) # position of end of domain part of url, default is end
416 wdelim = url.find(c, start) # find first of this delim
419 return url[start:delim], url[delim:] # return (domain, rest)
440 # https://www.rfc-editor.org/rfc/rfc3986#page-49 and https://url.spec.whatwg.org/
443 if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", hostname):
453 def urlsplit(url, scheme='', allow_fragments=True): argument
454 """Parse a URL into 5 components:
457 The result is a named 5-tuple with fields corresponding to the
459 depending on the type of the url parameter.
461 The username, password, hostname, and port sub-components of netloc
465 component when no scheme is found in url.
474 url, scheme, _coerce_result = _coerce_args(url, scheme)
475 # Only lstrip url as some applications rely on preserving trailing space.
476 # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
477 url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
481 url = url.replace(b, "")
486 i = url.find(':')
487 if i > 0 and url[0].isascii() and url[0].isalpha():
488 for c in url[:i]:
492 scheme, url = url[:i].lower(), url[i+1:]
493 if url[:2] == '//':
494 netloc, url = _splitnetloc(url, 2)
497 raise ValueError("Invalid IPv6 URL")
501 if allow_fragments and '#' in url:
502 url, fragment = url.split('#', 1)
503 if '?' in url:
504 url, query = url.split('?', 1)
506 v = SplitResult(scheme, netloc, url, query, fragment)
510 """Put a parsed URL back together again. This may result in a
511 slightly different, but equivalent URL, if the URL that was parsed
514 scheme, netloc, url, params, query, fragment, _coerce_result = (
517 url = "%s;%s" % (url, params)
518 return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
522 complete URL as a string. The data argument can be any five-item iterable.
523 This may result in a slightly different, but equivalent URL, if the URL that
526 scheme, netloc, url, query, fragment, _coerce_result = (
528 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
529 if url and url[:1] != '/': url = '/' + url
530 url = '//' + (netloc or '') + url
532 url = scheme + ':' + url
534 url = url + '?' + query
536 url = url + '#' + fragment
537 return _coerce_result(url)
539 def urljoin(base, url, allow_fragments=True): argument
540 """Join a base URL and a possibly relative URL to form an absolute
543 return url
544 if not url:
547 base, url, _coerce_result = _coerce_args(base, url)
551 urlparse(url, bscheme, allow_fragments)
554 return _coerce_result(url)
570 if base_parts[-1] != '':
573 del base_parts[-1]
580 # filter out elements that would cause redundant slashes on re-joining
582 segments[1:-1] = filter(None, segments[1:-1])
599 if segments[-1] in ('.', '..'):
600 # do some post-processing here. if the last segment was a relative dir,
608 def urldefrag(url): argument
609 """Removes any existing fragment from URL.
611 Returns a tuple of the defragmented URL and the fragment. If
612 the URL contained no fragments, the second element is the
615 url, _coerce_result = _coerce_args(url)
616 if '#' in url:
617 s, n, p, a, q, frag = urlparse(url)
621 defrag = url
628 """unquote_to_bytes('abc%20def') -> b'abc def'."""
629 # Note: strings are encoded as UTF-8. This is only an issue if it contains
630 # unescaped non-ASCII characters, which URIs should not.
632 # Is it a string-like object?
636 string = string.encode('utf-8')
657 _asciire = re.compile('([\x00-\x7f]+)')
659 def unquote(string, encoding='utf-8', errors='replace'):
660 """Replace %xx escapes by their single-character equivalent. The optional
661 encoding and errors parameters specify how to decode percent-encoded
664 By default, percent-encoded sequences are decoded with UTF-8, and invalid
667 unquote('abc%20def') -> 'abc def'.
675 encoding = 'utf-8'
688 encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
689 """Parse a query given as a string argument.
693 qs: percent-encoded query string to be parsed
696 percent-encoded queries should be treated as blank strings.
706 encoding and errors: specify how to decode percent-encoded sequences
730 encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
731 """Parse a query given as a string argument.
735 qs: percent-encoded query string to be parsed
738 percent-encoded queries should be treated as blank strings.
747 encoding and errors: specify how to decode percent-encoded sequences
756 Returns a list, as G-d intended.
781 # Handle case of a control-name with no equal sign
796 def unquote_plus(string, encoding='utf-8', errors='replace'):
800 unquote_plus('%7e/abc+def') -> '~/abc def'
808 b'_.-~')
814 'urllib.parse.Quoter will be removed in Python 3.14. '
823 String values are percent-encoded byte values, unless the key < 128, and
842 """quote('abc def') -> 'abc%20def'
844 Each part of a URL, e.g. the path info, the query, etc., has a
852 unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
853 reserved = gen-delims / sub-delims
854 gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
855 sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
858 Each of the reserved characters is reserved in some component of a URL,
861 The quote function %-escapes all characters that are neither in the
869 Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
876 non-ASCII characters, as accepted by the str.encode method.
877 By default, encoding='utf-8' (characters are encoded with UTF-8), and
884 encoding = 'utf-8'
919 not perform string-to-bytes encoding. It always returns an ASCII string.
920 quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'
927 # Normalize 'safe' by converting to bytes and removing non-ASCII chars
939 """Encode a dict or sequence of two-element tuples into a URL query string.
944 If the query arg is a sequence of two-element tuples, the order of the
957 # It's a bother at times that strings and string-like objects are
960 # non-sequence items should not work with len()
961 # non-empty strings will fail this
964 # Zero-length sequences of all types will get here and succeed,
969 raise TypeError("not a valid non-string sequence "
1000 # Is this a sufficient test for sequence-ness?
1017 def to_bytes(url): argument
1018 warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
1020 return _to_bytes(url)
1023 def _to_bytes(url): argument
1024 """to_bytes(u"URL") --> 'URL'."""
1025 # Most URL schemes require ASCII. If that changes, the conversion
1028 if isinstance(url, str):
1030 url = url.encode("ASCII").decode()
1032 raise UnicodeError("URL " + repr(url) +
1033 " contains non-ASCII characters")
1034 return url
1037 def unwrap(url): argument
1038 """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.
1040 The string is returned unchanged if it's not a wrapped URL.
1042 url = str(url).strip()
1043 if url[:1] == '<' and url[-1:] == '>':
1044 url = url[1:-1].strip()
1045 if url[:4] == 'URL:':
1046 url = url[4:].strip()
1047 return url
1050 def splittype(url): argument
1051 warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, "
1052 "use urllib.parse.urlparse() instead",
1054 return _splittype(url)
1058 def _splittype(url): argument
1059 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1064 match = _typeprog.match(url)
1068 return None, url
1071 def splithost(url): argument
1072 warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, "
1073 "use urllib.parse.urlparse() instead",
1075 return _splithost(url)
1079 def _splithost(url): argument
1080 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1085 match = _hostprog.match(url)
1091 return None, url
1095 warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, "
1096 "use urllib.parse.urlparse() instead",
1102 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1108 warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, "
1109 "use urllib.parse.urlparse() instead",
1115 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1121 warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, "
1122 "use urllib.parse.urlparse() instead",
1127 # splitport('host:port') --> 'host', 'port'
1130 """splitport('host:port') --> 'host', 'port'."""
1133 _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)
1143 def splitnport(host, defport=-1):
1144 warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
1145 "use urllib.parse.urlparse() instead",
1150 def _splitnport(host, defport=-1):
1152 Return given default port if no ':' found; defaults to -1.
1167 def splitquery(url): argument
1168 warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, "
1169 "use urllib.parse.urlparse() instead",
1171 return _splitquery(url)
1174 def _splitquery(url): argument
1175 """splitquery('/path?query') --> '/path', 'query'."""
1176 path, delim, query = url.rpartition('?')
1179 return url, None
1182 def splittag(url): argument
1183 warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, "
1184 "use urllib.parse.urlparse() instead",
1186 return _splittag(url)
1189 def _splittag(url): argument
1190 """splittag('/path#tag') --> '/path', 'tag'."""
1191 path, delim, tag = url.rpartition('#')
1194 return url, None
1197 def splitattr(url): argument
1198 warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, "
1199 "use urllib.parse.urlparse() instead",
1201 return _splitattr(url)
1204 def _splitattr(url): argument
1205 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1207 words = url.split(';')
1212 warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, "
1213 "use urllib.parse.parse_qsl() instead",
1219 """splitvalue('attr=value') --> 'attr', 'value'."""