• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L.  Masinter, January 2005.

RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
and L.Masinter, December 1999.

RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994

RFC 3986 is considered the current standard and any future changes to
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.

"""
30
31import re
32
33__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
34           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
35
# Scheme classification tables.  The empty string '' is the entry that
# applies when no scheme is present (the default behaviour).

# Schemes for which urljoin() resolves a reference against a base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'svn',
    'svn+ssh',
]

# Schemes consulted by urlunsplit() and urljoin() when handling the
# '//netloc' part of a URL.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp', 'tel',
]

# The three tables below are no longer used by this module.  They are
# undocumented but carry public-looking names, so they are kept for
# backwards compatibility.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Every character allowed in a scheme name: letters, digits, '+', '-', '.'.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789+-.')

# urlsplit() empties its result cache once it holds this many entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
67
def clear_cache():
    """Discard every result that urlsplit() has stored in its cache.

    The cache dictionary is emptied in place, so the same object stays in
    use afterwards.
    """
    _parse_cache.clear()
71
72
class ResultMixin(object):
    """Convenience accessors computed from the ``netloc`` attribute.

    A class mixing this in must provide a string attribute ``netloc``; each
    property below re-derives its value from it on every access.
    """

    @property
    def username(self):
        """The user name in ``user[:password]@host``, or None without '@'."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        if not at_sign:
            return None
        # Text before the first ':' (all of the userinfo when there is none).
        return userinfo.partition(":")[0]

    @property
    def password(self):
        """The password in ``user:password@host``, or None when absent."""
        userinfo, at_sign, _ = self.netloc.rpartition("@")
        _, colon, secret = userinfo.partition(":")
        if at_sign and colon:
            return secret
        return None

    @property
    def hostname(self):
        """The lower-cased host name, or None when nothing follows the '@'.

        When the host part contains both '[' and ']', the text up to the
        first ']' is returned with its first character dropped.
        """
        host = self.netloc.rpartition('@')[2]
        if not host:
            return None
        if '[' in host and ']' in host:
            return host.partition(']')[0][1:].lower()
        # Cut at the first ':' if there is one; otherwise keep the whole host.
        return host.partition(':')[0].lower()

    @property
    def port(self):
        """The port as an int, or None if missing, empty or outside 0..65535.

        A non-numeric port makes int() raise ValueError.
        """
        # Look only at what follows the last '@' and the last ']'.
        tail = self.netloc.rpartition('@')[2].rpartition(']')[2]
        fields = tail.split(':')
        if len(fields) < 2 or not fields[1]:
            return None
        number = int(fields[1], 10)
        if 0 <= number <= 65535:
            return number
        return None
118
119from collections import namedtuple
120
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field named tuple returned by urlsplit().

    The ResultMixin properties are available on it as well.
    """

    __slots__ = ()

    def geturl(self):
        """Rebuild a URL string by handing this tuple to urlunsplit()."""
        return urlunsplit(self)
127
128
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field named tuple returned by urlparse().

    The ResultMixin properties are available on it as well.
    """

    __slots__ = ()

    def geturl(self):
        """Rebuild a URL string by handing this tuple to urlunparse()."""
        return urlunparse(self)
135
136
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into its six components.

    The general shape is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult
    (scheme, netloc, path, params, query, fragment).

    Components are not broken down any further (netloc stays one string)
    and % escapes are not expanded.  The splitting itself is delegated to
    urlsplit(); params are separated from the path only when the scheme is
    listed in uses_params and the path contains a ';'.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
150
def _splitparams(url):
    """Split ``path;params`` into a ``(path, params)`` pair.

    Only a ';' located after the last '/' starts the params, so semicolons
    in earlier path segments stay in the path.  When no such ';' exists the
    input is returned unchanged together with an empty params string.
    """
    # rfind() gives -1 when there is no '/', so the search then starts at 0.
    # The character at the '/' position itself can never be ';', so starting
    # one past it finds the same ';' as starting on it.
    i = url.find(';', url.rfind('/') + 1)
    if i < 0:
        # Without this guard a missing ';' would slice with -1 and drop the
        # last character of the path.
        return url, ''
    return url[:i], url[i+1:]
159
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for *url*, scanning from index *start*.

    The netloc runs from *start* up to the earliest '/', '?' or '#' found at
    or after *start*; if none is found it runs to the end of the string.
    *rest* begins with that delimiter, or is empty when there was none.
    """
    positions = [url.find(mark, start) for mark in '/?#']
    found = [pos for pos in positions if pos >= 0]
    end = min(found) if found else len(url)
    return url[start:end], url[end:]
167
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into its five components.

    The general shape is
    <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult
    (scheme, netloc, path, query, fragment).

    Components are not broken down any further (netloc stays one string)
    and % escapes are not expanded.  *scheme* is the value reported when the
    URL itself carries none; a false *allow_fragments* leaves any '#' in
    place instead of splitting off a fragment.

    Results are stored in a module-level cache, which is emptied once it
    holds MAX_CACHE_SIZE entries.  ValueError is raised when the netloc
    contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key, so equal values of different
    # types are cached separately.
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    colon = url.find(':')
    if colon > 0:
        candidate = url[:colon]
        remainder = url[colon + 1:]
        if candidate == 'http':
            # Common case: accepted immediately, without the port-number
            # check applied to other schemes below.
            scheme, url = candidate.lower(), remainder
        elif all(ch in scheme_chars for ch in candidate):
            # If everything after the ':' is digits, this is really
            # "host:port" and the would-be scheme belongs to the path.
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                scheme, url = candidate.lower(), remainder

    netloc = query = fragment = ''
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        has_open = '[' in netloc
        has_close = ']' in netloc
        if has_open != has_close:
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
222
def urlunparse(data):
    """Reassemble a URL from its six parsed components.

    *data* is unpacked as (scheme, netloc, path, params, query, fragment).
    Non-empty params are appended to the path after a ';' and the rest of
    the work is delegated to urlunsplit().  The result may differ slightly
    from the URL originally parsed while remaining equivalent, e.g. when
    the original had a redundant delimiter such as a '?' with an empty
    query.
    """
    scheme, netloc, path, params, query, fragment = data
    full_path = "%s;%s" % (path, params) if params else path
    return urlunsplit((scheme, netloc, full_path, query, fragment))
232
def urlunsplit(data):
    """Join the five items produced by urlsplit() back into one URL string.

    *data* may be any iterable of exactly five items:
    (scheme, netloc, path, query, fragment).  The result may differ slightly
    from the URL originally parsed while remaining equivalent, for example
    when the original had an unnecessary delimiter such as a '?' with an
    empty query.
    """
    scheme, netloc, url, query, fragment = data
    # A '//' authority prefix is written when there is a netloc, or when the
    # scheme is one listed in uses_netloc and the path does not already
    # begin with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
250
def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against the URL *base*.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base*.  *url* is also returned unchanged when its scheme differs from
    the base scheme or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    bscheme, bnetloc, bpath, bparams, bquery, _bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc, so nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc

    if path[:1] != '/':
        if path or params:
            # Relative path: replace the last segment of the base path with
            # the segments of the reference, then tidy up '.' and '..'.
            segments = bpath.split('/')[:-1] + path.split('/')
            # XXX The stuff below is bogus in various ways...
            if segments[-1] == '.':
                segments[-1] = ''
            segments = [part for part in segments if part != '.']
            # Repeatedly delete the first "name/.." pair that is not in the
            # final position, rescanning from the start after each deletion.
            collapsed = True
            while collapsed:
                collapsed = False
                for idx in range(1, len(segments) - 1):
                    if (segments[idx] == '..'
                            and segments[idx - 1] not in ('', '..')):
                        del segments[idx - 1:idx + 1]
                        collapsed = True
                        break
            if segments == ['', '..']:
                segments[-1] = ''
            elif len(segments) >= 2 and segments[-1] == '..':
                segments[-2:] = ['']
            path = '/'.join(segments)
        else:
            # Neither path nor params given: take both from the base, and
            # the base query too unless the reference supplies one.
            path, params = bpath, bparams
            query = query or bquery

    return urlunparse((scheme, netloc, path, params, query, fragment))
302
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a ``(url_without_fragment, fragment)`` tuple.  A URL containing
    no '#' is returned untouched with an empty string as the fragment.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
316
# Pick the _is_unicode() implementation once, at import time, according to
# whether the ``unicode`` builtin exists.
try:
    unicode
except NameError:
    def _is_unicode(x):
        """Always answer 0: there is no ``unicode`` type to be an instance of."""
        return 0
else:
    def _is_unicode(x):
        """Tell whether *x* is an instance of the builtin ``unicode`` type."""
        return isinstance(x, unicode)
325
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.
330
_hexdig = '0123456789ABCDEFabcdef'
# Maps every two-character hex spelling, in any mix of upper and lower case,
# to the single character with that ordinal.
_hextochr = dict(
    (hi + lo, chr(int(hi + lo, 16)))
    for hi in _hexdig
    for lo in _hexdig
)
# Matches runs of ASCII characters.  The capturing group makes split() keep
# the matched runs in its result.
_asciire = re.compile('([\x00-\x7f]+)')
335
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%XX' sequence, with XX two hex digits, is replaced by the
    character with that code.  A '%' not followed by two hex digits is left
    as it is.
    """
    if _is_unicode(s):
        if '%' not in s:
            return s
        # The capturing split alternates non-ASCII text (even indexes) with
        # ASCII runs (odd indexes).  Only the ASCII runs are unquoted, as
        # byte strings, and then decoded as latin1.
        runs = _asciire.split(s)
        out = [runs[0]]
        for idx in range(1, len(runs), 2):
            out.append(unquote(str(runs[idx])).decode('latin1'))
            out.append(runs[idx + 1])
        return ''.join(out)

    chunks = s.split('%')
    # fastpath: nothing to decode
    if len(chunks) == 1:
        return s
    out = [chunks[0]]
    for chunk in chunks[1:]:
        decoded = _hextochr.get(chunk[:2])
        if decoded is None:
            # Not a valid escape: put the '%' back and keep the text as is.
            out.extend(('%', chunk))
        else:
            out.extend((decoded, chunk[2:]))
    return ''.join(out)
363
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to the list of its values, in the
    order in which parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
389
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear.
    Fields are separated by '&' or ';'; in names and values '+' becomes a
    space and % escapes are decoded with unquote().
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        # Empty fields (e.g. from '&&') are skipped unless parsing strictly.
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # The call form of raise is valid on both Python 2 and
                # Python 3, unlike the "raise E, msg" statement form.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
429