1"""Parse (absolute and relative) URLs. 2 3urlparse module is based upon the following RFC specifications. 4 5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 6and L. Masinter, January 2005. 7 8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter 9and L.Masinter, December 1999. 10 11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 12Berners-Lee, R. Fielding, and L. Masinter, August 1998. 13 14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998. 15 16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 171995. 18 19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 20McCahill, December 1994 21 22RFC 3986 is considered the current standard and any future changes to 23urlparse module should conform with it. The urlparse module is 24currently not entirely compliant with this RFC due to defacto 25scenarios for parsing, and for backward compatibility purposes, some 26parsing quirks from older RFCs are retained. The testcases in 27test_urlparse.py provides a good indicator of parsing behavior. 28 29""" 30 31import re 32 33__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", 34 "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"] 35 36# A classification of schemes ('' means apply by default) 37uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', 38 'wais', 'file', 'https', 'shttp', 'mms', 39 'prospero', 'rtsp', 'rtspu', '', 'sftp', 40 'svn', 'svn+ssh'] 41uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 42 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 43 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', 44 'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh'] 45uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', 46 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', 47 'mms', '', 'sftp', 'tel'] 48 49# These are not actually used anymore, but should stay for backwards 50# compatibility. 
(They are undocumented, but have a public-looking name.) 51non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 52 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] 53uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', 54 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] 55uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 56 'nntp', 'wais', 'https', 'shttp', 'snews', 57 'file', 'prospero', ''] 58 59# Characters valid in scheme names 60scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 61 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 62 '0123456789' 63 '+-.') 64 65MAX_CACHE_SIZE = 20 66_parse_cache = {} 67 68def clear_cache(): 69 """Clear the parse cache.""" 70 _parse_cache.clear() 71 72 73class ResultMixin(object): 74 """Shared methods for the parsed result objects.""" 75 76 @property 77 def username(self): 78 netloc = self.netloc 79 if "@" in netloc: 80 userinfo = netloc.rsplit("@", 1)[0] 81 if ":" in userinfo: 82 userinfo = userinfo.split(":", 1)[0] 83 return userinfo 84 return None 85 86 @property 87 def password(self): 88 netloc = self.netloc 89 if "@" in netloc: 90 userinfo = netloc.rsplit("@", 1)[0] 91 if ":" in userinfo: 92 return userinfo.split(":", 1)[1] 93 return None 94 95 @property 96 def hostname(self): 97 netloc = self.netloc.split('@')[-1] 98 if '[' in netloc and ']' in netloc: 99 return netloc.split(']')[0][1:].lower() 100 elif ':' in netloc: 101 return netloc.split(':')[0].lower() 102 elif netloc == '': 103 return None 104 else: 105 return netloc.lower() 106 107 @property 108 def port(self): 109 netloc = self.netloc.split('@')[-1].split(']')[-1] 110 if ':' in netloc: 111 port = netloc.split(':')[1] 112 if port: 113 port = int(port, 10) 114 # verify legal port 115 if (0 <= port <= 65535): 116 return port 117 return None 118 119from collections import namedtuple 120 121class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin): 122 123 __slots__ = () 124 125 def geturl(self): 126 return urlunsplit(self) 127 128 129class 
ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin): 130 131 __slots__ = () 132 133 def geturl(self): 134 return urlunparse(self) 135 136 137def urlparse(url, scheme='', allow_fragments=True): 138 """Parse a URL into 6 components: 139 <scheme>://<netloc>/<path>;<params>?<query>#<fragment> 140 Return a 6-tuple: (scheme, netloc, path, params, query, fragment). 141 Note that we don't break the components up in smaller bits 142 (e.g. netloc is a single string) and we don't expand % escapes.""" 143 tuple = urlsplit(url, scheme, allow_fragments) 144 scheme, netloc, url, query, fragment = tuple 145 if scheme in uses_params and ';' in url: 146 url, params = _splitparams(url) 147 else: 148 params = '' 149 return ParseResult(scheme, netloc, url, params, query, fragment) 150 151def _splitparams(url): 152 if '/' in url: 153 i = url.find(';', url.rfind('/')) 154 if i < 0: 155 return url, '' 156 else: 157 i = url.find(';') 158 return url[:i], url[i+1:] 159 160def _splitnetloc(url, start=0): 161 delim = len(url) # position of end of domain part of url, default is end 162 for c in '/?#': # look for delimiters; the order is NOT important 163 wdelim = url.find(c, start) # find first of this delim 164 if wdelim >= 0: # if found 165 delim = min(delim, wdelim) # use earliest delim position 166 return url[start:delim], url[delim:] # return (domain, rest) 167 168def urlsplit(url, scheme='', allow_fragments=True): 169 """Parse a URL into 5 components: 170 <scheme>://<netloc>/<path>?<query>#<fragment> 171 Return a 5-tuple: (scheme, netloc, path, query, fragment). 172 Note that we don't break the components up in smaller bits 173 (e.g. 
netloc is a single string) and we don't expand % escapes.""" 174 allow_fragments = bool(allow_fragments) 175 key = url, scheme, allow_fragments, type(url), type(scheme) 176 cached = _parse_cache.get(key, None) 177 if cached: 178 return cached 179 if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth 180 clear_cache() 181 netloc = query = fragment = '' 182 i = url.find(':') 183 if i > 0: 184 if url[:i] == 'http': # optimize the common case 185 scheme = url[:i].lower() 186 url = url[i+1:] 187 if url[:2] == '//': 188 netloc, url = _splitnetloc(url, 2) 189 if (('[' in netloc and ']' not in netloc) or 190 (']' in netloc and '[' not in netloc)): 191 raise ValueError("Invalid IPv6 URL") 192 if allow_fragments and '#' in url: 193 url, fragment = url.split('#', 1) 194 if '?' in url: 195 url, query = url.split('?', 1) 196 v = SplitResult(scheme, netloc, url, query, fragment) 197 _parse_cache[key] = v 198 return v 199 for c in url[:i]: 200 if c not in scheme_chars: 201 break 202 else: 203 # make sure "url" is not actually a port number (in which case 204 # "scheme" is really part of the path) 205 rest = url[i+1:] 206 if not rest or any(c not in '0123456789' for c in rest): 207 # not a port number 208 scheme, url = url[:i].lower(), rest 209 210 if url[:2] == '//': 211 netloc, url = _splitnetloc(url, 2) 212 if (('[' in netloc and ']' not in netloc) or 213 (']' in netloc and '[' not in netloc)): 214 raise ValueError("Invalid IPv6 URL") 215 if allow_fragments and '#' in url: 216 url, fragment = url.split('#', 1) 217 if '?' in url: 218 url, query = url.split('?', 1) 219 v = SplitResult(scheme, netloc, url, query, fragment) 220 _parse_cache[key] = v 221 return v 222 223def urlunparse(data): 224 """Put a parsed URL back together again. This may result in a 225 slightly different, but equivalent URL, if the URL that was parsed 226 originally had redundant delimiters, e.g. a ? 
with an empty query 227 (the draft states that these are equivalent).""" 228 scheme, netloc, url, params, query, fragment = data 229 if params: 230 url = "%s;%s" % (url, params) 231 return urlunsplit((scheme, netloc, url, query, fragment)) 232 233def urlunsplit(data): 234 """Combine the elements of a tuple as returned by urlsplit() into a 235 complete URL as a string. The data argument can be any five-item iterable. 236 This may result in a slightly different, but equivalent URL, if the URL that 237 was parsed originally had unnecessary delimiters (for example, a ? with an 238 empty query; the RFC states that these are equivalent).""" 239 scheme, netloc, url, query, fragment = data 240 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): 241 if url and url[:1] != '/': url = '/' + url 242 url = '//' + (netloc or '') + url 243 if scheme: 244 url = scheme + ':' + url 245 if query: 246 url = url + '?' + query 247 if fragment: 248 url = url + '#' + fragment 249 return url 250 251def urljoin(base, url, allow_fragments=True): 252 """Join a base URL and a possibly relative URL to form an absolute 253 interpretation of the latter.""" 254 if not base: 255 return url 256 if not url: 257 return base 258 bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ 259 urlparse(base, '', allow_fragments) 260 scheme, netloc, path, params, query, fragment = \ 261 urlparse(url, bscheme, allow_fragments) 262 if scheme != bscheme or scheme not in uses_relative: 263 return url 264 if scheme in uses_netloc: 265 if netloc: 266 return urlunparse((scheme, netloc, path, 267 params, query, fragment)) 268 netloc = bnetloc 269 if path[:1] == '/': 270 return urlunparse((scheme, netloc, path, 271 params, query, fragment)) 272 if not path and not params: 273 path = bpath 274 params = bparams 275 if not query: 276 query = bquery 277 return urlunparse((scheme, netloc, path, 278 params, query, fragment)) 279 segments = bpath.split('/')[:-1] + path.split('/') 280 # XXX The stuff below is 
bogus in various ways... 281 if segments[-1] == '.': 282 segments[-1] = '' 283 while '.' in segments: 284 segments.remove('.') 285 while 1: 286 i = 1 287 n = len(segments) - 1 288 while i < n: 289 if (segments[i] == '..' 290 and segments[i-1] not in ('', '..')): 291 del segments[i-1:i+1] 292 break 293 i = i+1 294 else: 295 break 296 if segments == ['', '..']: 297 segments[-1] = '' 298 elif len(segments) >= 2 and segments[-1] == '..': 299 segments[-2:] = [''] 300 return urlunparse((scheme, netloc, '/'.join(segments), 301 params, query, fragment)) 302 303def urldefrag(url): 304 """Removes any existing fragment from URL. 305 306 Returns a tuple of the defragmented URL and the fragment. If 307 the URL contained no fragments, the second element is the 308 empty string. 309 """ 310 if '#' in url: 311 s, n, p, a, q, frag = urlparse(url) 312 defrag = urlunparse((s, n, p, a, q, '')) 313 return defrag, frag 314 else: 315 return url, '' 316 317try: 318 unicode 319except NameError: 320 def _is_unicode(x): 321 return 0 322else: 323 def _is_unicode(x): 324 return isinstance(x, unicode) 325 326# unquote method for parse_qs and parse_qsl 327# Cannot use directly from urllib as it would create a circular reference 328# because urllib uses urlparse methods (urljoin). If you update this function, 329# update it also in urllib. This code duplication does not existin in Python3. 
_hexdig = '0123456789ABCDEFabcdef'
# Map every two-hex-digit string (in any case mix) to its character,
# so percent-decoding is a single dict lookup per escape.
_hextochr = dict((a + b, chr(int(a + b, 16)))
                 for a in _hexdig for b in _hexdig)
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    if _is_unicode(s):
        if '%' not in s:
            return s
        # Decode the ASCII runs as bytes, leave non-ASCII text untouched.
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i])).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            # Not a valid escape; keep the '%' literally.
            append('%')
            append(item)
    return ''.join(res)

def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to a list of its values.
    """
    qs_dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in qs_dict:
            qs_dict[name].append(value)
        else:
            qs_dict[name] = [value]
    return qs_dict

def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs.
    """
    # Both '&' and ';' are accepted as pair separators.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r