parse.py - OpenGrok cross reference for /external/python/cpython3/Lib/urllib/parse.py

Lines Matching +full:url +full:- +full:parse
1 """Parse (absolute and relative) URLs.
5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
8 RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
12 Berners-Lee, R. Fielding, and L. Masinter, August 1998.
14 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
29 The WHATWG URL Parser spec should also be considered.  We are not compliant with
69 # compatibility.  (They are undocumented, but have a public-looking name.)
85                 '+-.')
103 # presented, we may relax this by using latin-1
123     #   - noop for str inputs
124     #   - encoding function otherwise
127         # We special-case the empty string to support the
130             raise TypeError("Cannot mix str and non-str arguments")
184                 raise ValueError("Port out of range 0-65535")
250 _DefragResultBase = namedtuple('DefragResult', 'url fragment')
257 DefragResult(url, fragment)
259 A 2-tuple that contains the url without fragment identifier and the fragment
263 _DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
266 Fragment identifier separated from URL, that allows indirect identification of a
274 A 5-tuple that contains the different components of a URL. Similar to
278 _SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
289 The query component, that contains non-hierarchical data, that along with data
302 A 6-tuple that contains components of a parsed URL.
327             return self.url + '#' + self.fragment
329             return self.url
346             return self.url + b'#' + self.fragment
348             return self.url
374 def urlparse(url, scheme='', allow_fragments=True):  argument
375     """Parse a URL into 6 components:
378     The result is a named 6-tuple with fields corresponding to the
380     depending on the type of the url parameter.
382     The username, password, hostname, and port sub-components of netloc
386     component when no scheme is found in url.
394     url, scheme, _coerce_result = _coerce_args(url, scheme)
395     splitresult = urlsplit(url, scheme, allow_fragments)
396     scheme, netloc, url, query, fragment = splitresult
397     if scheme in uses_params and ';' in url:
398         url, params = _splitparams(url)
401     result = ParseResult(scheme, netloc, url, params, query, fragment)
404 def _splitparams(url):  argument
405     if '/'  in url:
406         i = url.find(';', url.rfind('/'))
408             return url, ''
410         i = url.find(';')
411     return url[:i], url[i+1:]
413 def _splitnetloc(url, start=0):  argument
414     delim = len(url)   # position of end of domain part of url, default is end
416         wdelim = url.find(c, start)        # find first of this delim
419     return url[start:delim], url[delim:]   # return (domain, rest)
440 # https://www.rfc-editor.org/rfc/rfc3986#page-49 and https://url.spec.whatwg.org/
443         if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", hostname):
453 def urlsplit(url, scheme='', allow_fragments=True):  argument
454     """Parse a URL into 5 components:
457     The result is a named 5-tuple with fields corresponding to the
459     depending on the type of the url parameter.
461     The username, password, hostname, and port sub-components of netloc
465     component when no scheme is found in url.
474     url, scheme, _coerce_result = _coerce_args(url, scheme)
475     # Only lstrip url as some applications rely on preserving trailing space.
476     # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
477     url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
481         url = url.replace(b, "")
486     i = url.find(':')
487     if i > 0 and url[0].isascii() and url[0].isalpha():
488         for c in url[:i]:
492             scheme, url = url[:i].lower(), url[i+1:]
493     if url[:2] == '//':
494         netloc, url = _splitnetloc(url, 2)
497             raise ValueError("Invalid IPv6 URL")
501     if allow_fragments and '#' in url:
502         url, fragment = url.split('#', 1)
503     if '?' in url:
504         url, query = url.split('?', 1)
506     v = SplitResult(scheme, netloc, url, query, fragment)
510     """Put a parsed URL back together again.  This may result in a
511     slightly different, but equivalent URL, if the URL that was parsed
514     scheme, netloc, url, params, query, fragment, _coerce_result = (
517         url = "%s;%s" % (url, params)
518     return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
522     complete URL as a string. The data argument can be any five-item iterable.
523     This may result in a slightly different, but equivalent URL, if the URL that
526     scheme, netloc, url, query, fragment, _coerce_result = (
528     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
529         if url and url[:1] != '/': url = '/' + url
530         url = '//' + (netloc or '') + url
532         url = scheme + ':' + url
534         url = url + '?' + query
536         url = url + '#' + fragment
537     return _coerce_result(url)
539 def urljoin(base, url, allow_fragments=True):  argument
540     """Join a base URL and a possibly relative URL to form an absolute
543         return url
544     if not url:
547     base, url, _coerce_result = _coerce_args(base, url)
551             urlparse(url, bscheme, allow_fragments)
554         return _coerce_result(url)
570     if base_parts[-1] != '':
573         del base_parts[-1]
580         # filter out elements that would cause redundant slashes on re-joining
582         segments[1:-1] = filter(None, segments[1:-1])
599     if segments[-1] in ('.', '..'):
600         # do some post-processing here. if the last segment was a relative dir,
608 def urldefrag(url):  argument
609     """Removes any existing fragment from URL.
611     Returns a tuple of the defragmented URL and the fragment.  If
612     the URL contained no fragments, the second element is the
615     url, _coerce_result = _coerce_args(url)
616     if '#' in url:
617         s, n, p, a, q, frag = urlparse(url)
621         defrag = url
628     """unquote_to_bytes('abc%20def') -> b'abc def'."""
629     # Note: strings are encoded as UTF-8. This is only an issue if it contains
630     # unescaped non-ASCII characters, which URIs should not.
632         # Is it a string-like object?
636         string = string.encode('utf-8')
657 _asciire = re.compile('([\x00-\x7f]+)')
659 def unquote(string, encoding='utf-8', errors='replace'):
660     """Replace %xx escapes by their single-character equivalent. The optional
661     encoding and errors parameters specify how to decode percent-encoded
664     By default, percent-encoded sequences are decoded with UTF-8, and invalid
667     unquote('abc%20def') -> 'abc def'.
675         encoding = 'utf-8'
688              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
689     """Parse a query given as a string argument.
693         qs: percent-encoded query string to be parsed
696             percent-encoded queries should be treated as blank strings.
706         encoding and errors: specify how to decode percent-encoded sequences
730               encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
731     """Parse a query given as a string argument.
735         qs: percent-encoded query string to be parsed
738             percent-encoded queries should be treated as blank strings.
747         encoding and errors: specify how to decode percent-encoded sequences
756         Returns a list, as G-d intended.
781             # Handle case of a control-name with no equal sign
796 def unquote_plus(string, encoding='utf-8', errors='replace'):
800     unquote_plus('%7e/abc+def') -> '~/abc def'
808                          b'_.-~')
814                       'urllib.parse.Quoter will be removed in Python 3.14. '
823     String values are percent-encoded byte values, unless the key < 128, and
842     """quote('abc def') -> 'abc%20def'
844     Each part of a URL, e.g. the path info, the query, etc., has a
852     unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
853     reserved      = gen-delims / sub-delims
854     gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
855     sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
858     Each of the reserved characters is reserved in some component of a URL,
861     The quote function %-escapes all characters that are neither in the
869     Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
876     non-ASCII characters, as accepted by the str.encode method.
877     By default, encoding='utf-8' (characters are encoded with UTF-8), and
884             encoding = 'utf-8'
919     not perform string-to-bytes encoding.  It always returns an ASCII string.
920     quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'
927         # Normalize 'safe' by converting to bytes and removing non-ASCII chars
939     """Encode a dict or sequence of two-element tuples into a URL query string.
944     If the query arg is a sequence of two-element tuples, the order of the
957         # It's a bother at times that strings and string-like objects are
960             # non-sequence items should not work with len()
961             # non-empty strings will fail this
964             # Zero-length sequences of all types will get here and succeed,
969             raise TypeError("not a valid non-string sequence "
1000                     # Is this a sufficient test for sequence-ness?
1017 def to_bytes(url):  argument
1018     warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
1020     return _to_bytes(url)
1023 def _to_bytes(url):  argument
1024     """to_bytes(u"URL") --> 'URL'."""
1025     # Most URL schemes require ASCII. If that changes, the conversion
1028     if isinstance(url, str):
1030             url = url.encode("ASCII").decode()
1032             raise UnicodeError("URL " + repr(url) +
1033                                " contains non-ASCII characters")
1034     return url
1037 def unwrap(url):  argument
1038     """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.
1040     The string is returned unchanged if it's not a wrapped URL.
1042     url = str(url).strip()
1043     if url[:1] == '<' and url[-1:] == '>':
1044         url = url[1:-1].strip()
1045     if url[:4] == 'URL:':
1046         url = url[4:].strip()
1047     return url
1050 def splittype(url):  argument
1051     warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, "
1052                   "use urllib.parse.urlparse() instead",
1054     return _splittype(url)
1058 def _splittype(url):  argument
1059     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1064     match = _typeprog.match(url)
1068     return None, url
1071 def splithost(url):  argument
1072     warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, "
1073                   "use urllib.parse.urlparse() instead",
1075     return _splithost(url)
1079 def _splithost(url):  argument
1080     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1085     match = _hostprog.match(url)
1091     return None, url
1095     warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, "
1096                   "use urllib.parse.urlparse() instead",
1102     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1108     warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, "
1109                   "use urllib.parse.urlparse() instead",
1115     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1121     warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, "
1122                   "use urllib.parse.urlparse() instead",
1127 # splittag('/path#tag') --> '/path', 'tag'
1130     """splitport('host:port') --> 'host', 'port'."""
1133         _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)
1143 def splitnport(host, defport=-1):
1144     warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
1145                   "use urllib.parse.urlparse() instead",
1150 def _splitnport(host, defport=-1):
1152     Return given default port if no ':' found; defaults to -1.
1167 def splitquery(url):  argument
1168     warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, "
1169                   "use urllib.parse.urlparse() instead",
1171     return _splitquery(url)
1174 def _splitquery(url):  argument
1175     """splitquery('/path?query') --> '/path', 'query'."""
1176     path, delim, query = url.rpartition('?')
1179     return url, None
1182 def splittag(url):  argument
1183     warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, "
1184                   "use urllib.parse.urlparse() instead",
1186     return _splittag(url)
1189 def _splittag(url):  argument
1190     """splittag('/path#tag') --> '/path', 'tag'."""
1191     path, delim, tag = url.rpartition('#')
1194     return url, None
1197 def splitattr(url):  argument
1198     warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, "
1199                   "use urllib.parse.urlparse() instead",
1201     return _splitattr(url)
1204 def _splitattr(url):  argument
1205     """splitattr('/path;attr1=value1;attr2=value2;...') ->
1207     words = url.split(';')
1212     warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, "
1213                   "use urllib.parse.parse_qsl() instead",
1219     """splitvalue('attr=value') --> 'attr', 'value'."""