"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee,
R. Fielding and L. Masinter, January 2005.

RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden,
B.Carpenter and L.Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P.Hoffman, L Masinter, J. Zwinski,
July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine,
June 1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter,
M. McCahill, December 1994

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to defacto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The testcases in
test_urlparse.py provides a good indicator of parsing behavior.
"""

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results; the cache is wiped once it grows past
# this size to avoid runaway growth on pathological inputs.
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Derives username/password/hostname/port views from self.netloc,
    which the concrete namedtuple subclasses provide.
    """

    @property
    def username(self):
        """User name from the userinfo part of netloc, or None."""
        netloc = self.netloc
        if "@" in netloc:
            # rsplit: '@' may legally appear (percent-unencoded) in userinfo
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the userinfo part of netloc, or None."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Host part of netloc, lowercased; None if empty."""
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # RFC 2732 bracketed IPv6 literal: strip the brackets
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None when no port is given."""
        # Drop userinfo and any bracketed IPv6 literal before looking
        # for the port separator.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            return int(port, 10)
        else:
            return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-tuple result of urlsplit(), with attribute access."""

    __slots__ = ()

    def geturl(self):
        """Recombine the parts into a URL string."""
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-tuple result of urlparse(), with attribute access."""

    __slots__ = ()

    def geturl(self):
        """Recombine the parts into a URL string."""
        return urlunparse(self)


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # (renamed local: the original shadowed the builtin 'tuple')
    split = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = split
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)

def _splitparams(url):
    """Split ';params' off the last path segment; return (path, params)."""
    if '/' in url:
        # params belong to the last segment only
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    """Split off the netloc; return (domain, rest-of-url)."""
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # Types are part of the cache key: str and unicode inputs must not
    # share cached results.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # RFC 2732 brackets must be balanced
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            try:
                # make sure "url" is not actually a port number (in which
                # case "scheme" is really part of the path)
                _testportnum = int(url[i+1:])
            except ValueError:
                scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Relative reference with no path: inherit base path/params,
        # and base query too unless the reference carries its own.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''

# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.

_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-hex-digit escape (case-insensitive) to its character.
_hextochr = dict((a+b, chr(int(a+b,16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    # fastpath: no '%' at all means nothing to decode
    if len(res) == 1:
        return s
    s = res[0]
    for item in res[1:]:
        try:
            s += _hextochr[item[:2]] + item[2:]
        except KeyError:
            # not a valid escape; keep the '%' literally
            s += '%' + item
        except UnicodeDecodeError:
            # unicode input: decode the escape as a code point (Python 2)
            s += unichr(int(item[:2], 16)) + item[2:]
    return s

def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dictionary mapping each name to a list of its values.
    """
    # (renamed local: the original shadowed the builtin 'dict')
    parsed = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in parsed:
            parsed[name].append(value)
        else:
            parsed[name] = [value]
    return parsed

def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    # Both '&' and ';' are accepted as pair separators.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r