• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L.  Masinter, January 2005.
7
RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
and L. Masinter, December 1999.
10
11RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
28
29"""
30
# Names exported by "from urlparse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
# Each table is a plain list consulted with "scheme in <table>"; the empty
# string entry makes the rule apply to URLs that carry no scheme at all.

# Schemes for which urljoin() resolves a relative reference against the base;
# for any other scheme urljoin() returns the second argument untouched.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Schemes that take a "//netloc" part: urlunsplit() emits "//" for them even
# when netloc is empty, and urljoin() inherits the base netloc for them.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
# Not referenced by any function visible in this file.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes for which urlsplit() splits "?query" off the path.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes for which urlsplit() splits "#fragment" off the path
# (only when allow_fragments is true).
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# urlsplit() only treats the text before the first ':' as a scheme when every
# character of it appears in this string.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied completely before the next insert.
MAX_CACHE_SIZE = 20
_parse_cache = {}
61
def clear_cache():
    """Clear the parse cache.

    Empties the module-level ``_parse_cache`` dictionary in place, so every
    result previously memoized by urlsplit() is discarded.
    """
    _parse_cache.clear()
65
66
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property derives its value from ``self.netloc``, which the host
    class must provide as a string of the general form
    ``[user[:password]@]host[:port]``.
    """

    @property
    def username(self):
        """User name from the netloc, or None when there is no '@'.

        The userinfo is everything before the LAST '@'; the user name is the
        part of it before the first ':'.
        """
        netloc = self.netloc
        if "@" not in netloc:
            return None
        userinfo = netloc.rsplit("@", 1)[0]
        if ":" in userinfo:
            userinfo = userinfo.split(":", 1)[0]
        return userinfo

    @property
    def password(self):
        """Password from the netloc, or None when none is present.

        This is the part of the userinfo after its first ':'; None is
        returned when the netloc has no '@' or the userinfo has no ':'.
        """
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when the netloc is empty.

        For a bracketed (IPv6 literal) host the text between '[' and the
        first ']' is returned without the brackets.
        """
        hostport = self.netloc.split('@')[-1]
        if '[' in hostport and ']' in hostport:
            # Drop the leading '[' and everything from the first ']' on.
            return hostport.split(']')[0][1:].lower()
        if ':' in hostport:
            return hostport.split(':')[0].lower()
        if hostport == '':
            return None
        return hostport.lower()

    @property
    def port(self):
        """Port number as an int, or None when absent or unusable.

        None is returned when the netloc has no ':' after the host, when the
        port text is empty (as in "host:"), and when the number falls outside
        0..65535.  A non-numeric port still raises ValueError from int().
        """
        # Strip userinfo, then anything up to a closing ']' so that the
        # colons inside an IPv6 literal are not mistaken for the port mark.
        hostport = self.netloc.split('@')[-1].split(']')[-1]
        if ':' not in hostport:
            return None
        digits = hostport.split(':')[1]
        if not digits:
            # "host:" -- a delimiter with no port behind it.
            return None
        number = int(digits, 10)
        if 0 <= number <= 65535:
            return number
        return None
109
110from collections import namedtuple
111
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """Five-field named tuple (scheme, netloc, path, query, fragment).

    Combines the namedtuple with ResultMixin and adds geturl().
    """

    # Empty __slots__: the subclass declares no instance attributes of its own.
    __slots__ = ()

    def geturl(self):
        """Return the URL string produced by passing this tuple to urlunsplit()."""
        return urlunsplit(self)
118
119
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """Six-field named tuple (scheme, netloc, path, params, query, fragment).

    Combines the namedtuple with ResultMixin and adds geturl().
    """

    # Empty __slots__: the subclass declares no instance attributes of its own.
    __slots__ = ()

    def geturl(self):
        """Return the URL string produced by passing this tuple to urlunparse()."""
        return urlunparse(self)
126
127
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into its six components.

    The general shape is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult
    (scheme, netloc, path, params, query, fragment).

    The work is delegated to urlsplit(); the only extra step is separating
    ";params" from the path, which happens only for schemes listed in
    uses_params.  Components are not broken down further (netloc stays a
    single string) and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
141
def _splitparams(url):
    """Split ";params" off the last segment of a path.

    Returns a ``(path, params)`` pair.  Only a ';' located in the final path
    segment (at or after the last '/') counts as the parameter delimiter.
    When there is no such ';' the path is returned unchanged together with
    an empty params string.
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        # Ignore any ';' that belongs to an earlier path segment.
        i = url.find(';', last_slash)
    else:
        i = url.find(';')
    if i < 0:
        # No delimiter: slicing with -1 would chop the last character off.
        return url, ''
    return url[:i], url[i+1:]
150
def _splitnetloc(url, start=0):
    """Cut the network-location part out of *url*.

    The netloc runs from index *start* up to the earliest '/', '?' or '#'
    found at or after *start* (or to the end of the string when none of them
    occurs).  Returns ``(netloc, rest)`` where *rest* begins with that
    delimiter.
    """
    positions = [url.find(mark, start) for mark in '/?#']
    found = [pos for pos in positions if pos >= 0]
    # No delimiter at all: the netloc extends to the end of the string.
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
158
def _check_ipv6_brackets(netloc):
    """Raise ValueError when *netloc* has a '[' without ']' or vice versa."""
    if ('[' in netloc) != (']' in netloc):
        raise ValueError("Invalid IPv6 URL")

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Arguments:

    url: the URL string to split.

    scheme: default scheme, used when the URL itself does not carry one.

    allow_fragments: when false, '#' is not treated as a fragment
        delimiter and the fragment component is always empty.

    The result is a SplitResult.  Results are memoized in _parse_cache,
    keyed on the arguments and their types.  ValueError is raised when the
    netloc contains an unbalanced '[' or ']'.
    """
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # This fast path splits query and fragment unconditionally,
            # without consulting uses_query / uses_fragment.
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                _check_ipv6_brackets(netloc)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path).  Only a non-empty run of
            # plain decimal digits counts as a port: int() would also accept
            # signs and surrounding whitespace, which made inputs such as
            # "tel:+31641044153" lose their scheme.
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        _check_ipv6_brackets(netloc)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
213
def urlunparse(data):
    """Reassemble a URL from a six-item (scheme, netloc, path, params,
    query, fragment) sequence.

    Non-empty params are appended to the path after a ';' and the rest of
    the work is handed to urlunsplit().  The output may differ slightly from
    the string that was originally parsed while remaining equivalent, for
    instance when the original had a redundant delimiter such as a '?'
    followed by an empty query.
    """
    scheme, netloc, path, params, query, fragment = data
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
223
def urlunsplit(data):
    """Build a URL string from a five-item (scheme, netloc, path, query,
    fragment) iterable, such as the tuple returned by urlsplit().

    The result may differ slightly from the originally parsed URL while
    remaining equivalent, e.g. when the original carried an unnecessary
    delimiter such as a '?' with an empty query.
    """
    scheme, netloc, path, query, fragment = data
    # An authority ("//netloc") is written when a netloc is given, and also
    # for netloc-using schemes whose path does not already begin with "//".
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
241
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:

    base: the base URL; when empty, url is returned unchanged.

    url: the (possibly relative) URL; when empty, base is returned unchanged.

    allow_fragments: passed through to urlparse() for both arguments.

    Returns the joined URL as a string.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default scheme for the relative reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative resolution,
    # means url is taken as it stands.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # url carries its own netloc: nothing from base except the scheme
        # default applies.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # Absolute path: replaces the base path outright.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # No path and no params: keep the base path/params, and the base query
    # too unless url supplies its own.
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append the
    # segments of the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment so the result keeps its
    # trailing slash; every other '.' segment is simply removed.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse "<name>/.." pairs.  Each pass scans indices
    # 1 .. len-2 (the final segment is never examined here), deletes the
    # first pair found and restarts; the inner loop's else clause ends the
    # outer loop once a full pass finds nothing to remove.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a '..' left in the final position: ['', '..'] becomes ['', ''],
    # otherwise the last two segments are replaced by one empty segment.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
293
def urldefrag(url):
    """Strip the fragment from a URL.

    Returns a ``(url_without_fragment, fragment)`` tuple.  A URL that
    contains no '#' is returned as is, paired with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
307
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.
312
# Every hexadecimal digit in both letter cases, so that two-character escapes
# such as "3f", "3F" are all present as keys of the table below.
_hexdig = '0123456789ABCDEFabcdef'
# Maps each two-hex-digit string to the character with that code.
_hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                 for hi in _hexdig
                 for lo in _hexdig)
316
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Replaces each %XX escape with the character it encodes.  A '%' that is
    not followed by two hexadecimal digits is left in the output untouched.
    """
    chunks = s.split('%')
    # fastpath: nothing to decode when the string holds no '%'
    if len(chunks) == 1:
        return s
    result = chunks[0]
    for chunk in chunks[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            result += _hextochr[code] + tail
        except KeyError:
            # Not a valid escape: put the '%' back and keep the text as is.
            result += '%' + chunk
        except UnicodeDecodeError:
            # The concatenation above failed to decode; retry with a
            # unicode character for the same code point.
            result += unichr(int(code, 16)) + tail
    return result
332
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of lists.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value keeps them as blank strings; the default false
            value drops them as if they had not been included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        Returns a dict mapping each field name to the list of its values,
        in the order parse_qsl() yields them.
    """
    grouped = {}
    for field, content in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(field, []).append(content)
    return grouped
358
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs in the order they appear in qs.
    Fields are separated by '&' or ';'; '+' is turned into a space and
    % escapes are decoded in both names and values.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        # Empty fields (e.g. from "a=1&&b=2") are skipped unless strict.
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
398