• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# coding: utf-8
2
3"""
4Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
5the following items:
6
7 - iri_to_uri()
8 - uri_to_iri()
9"""
10
11from __future__ import unicode_literals, division, absolute_import, print_function
12
13from encodings import idna  # noqa
14import codecs
15import re
16import sys
17
18from ._errors import unwrap
19from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types
20
21if sys.version_info < (3,):
22    from urlparse import urlsplit, urlunsplit
23    from urllib import (
24        quote as urlquote,
25        unquote as unquote_to_bytes,
26    )
27
28else:
29    from urllib.parse import (
30        quote as urlquote,
31        unquote_to_bytes,
32        urlsplit,
33        urlunsplit,
34    )
35
36
def iri_to_uri(value, normalize=False):
    """
    Encodes a unicode IRI into an ASCII byte string URI

    :param value:
        A unicode string of an IRI

    :param normalize:
        A bool that controls URI normalization. When True, the default port
        for http (80) and https (443) is omitted, and a path of just "/" is
        dropped when there is no query and no fragment.

    :raises:
        TypeError - when value is not a unicode string

    :return:
        A byte string of the ASCII-encoded URI
    """

    if not isinstance(value, str_cls):
        raise TypeError(unwrap(
            '''
            value must be a unicode string, not %s
            ''',
            type_name(value)
        ))

    scheme = None
    # Python 2.6 doesn't split properly if the URL doesn't start with
    # http:// or https://, so the real scheme prefix is temporarily replaced
    # with http:// for parsing and quoted separately
    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
        real_prefix = None
        prefix_match = re.match('^[^:]*://', value)
        if prefix_match:
            real_prefix = prefix_match.group(0)
            value = 'http://' + value[len(real_prefix):]
        parsed = urlsplit(value)
        if real_prefix:
            value = real_prefix + value[7:]
            scheme = _urlquote(real_prefix[:-3])
    else:
        parsed = urlsplit(value)

    if scheme is None:
        scheme = _urlquote(parsed.scheme)

    hostname = parsed.hostname
    if hostname is not None:
        # parsed.hostname strips the square brackets from an IP literal such
        # as [::1]. They must be restored, otherwise the colons of the
        # address become indistinguishable from the port separator.
        hostinfo = parsed.netloc.rpartition('@')[2]
        if hostinfo.startswith('['):
            hostname = b'[' + hostname.encode('ascii') + b']'
        else:
            hostname = hostname.encode('idna')

    # RFC 3986 allows userinfo to contain sub-delims
    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
    port = parsed.port
    if port is not None:
        port = str_cls(port).encode('ascii')

    netloc = b''
    if username is not None:
        netloc += username
        if password:
            netloc += b':' + password
        netloc += b'@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        default_http = scheme == b'http' and port == b'80'
        default_https = scheme == b'https' and port == b'443'
        # The port is only omitted when normalizing and it is the default
        # for the scheme
        if not normalize or (not default_http and not default_https):
            netloc += b':' + port

    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

    if normalize and query is None and fragment is None and path == b'/':
        path = None

    # Python 2.7 compat
    if path is None:
        path = ''

    output = urlunsplit((scheme, netloc, path, query, fragment))
    # urlunsplit() may hand back a unicode string; every component is ASCII
    # at this point, so the conversion to bytes is lossless
    if isinstance(output, str_cls):
        output = output.encode('latin1')
    return output
118
119
def uri_to_iri(value):
    """
    Converts an ASCII URI byte string into a unicode IRI

    :param value:
        An ASCII-encoded byte string of the URI

    :raises:
        TypeError - when value is not a byte string

    :return:
        A unicode string of the IRI
    """

    if not isinstance(value, byte_cls):
        raise TypeError(unwrap(
            '''
            value must be a byte string, not %s
            ''',
            type_name(value)
        ))

    parsed = urlsplit(value)

    scheme = parsed.scheme
    if scheme is not None:
        scheme = scheme.decode('ascii')

    # ":" and "@" delimit the userinfo, so decoded occurrences are re-escaped
    username = _urlunquote(parsed.username, remap=[':', '@'])
    password = _urlunquote(parsed.password, remap=[':', '@'])

    hostname = parsed.hostname
    if hostname:
        # parsed.hostname strips the square brackets from an IP literal such
        # as [::1]. They must be restored, otherwise the colons of the
        # address become indistinguishable from the port separator.
        hostinfo = parsed.netloc.rpartition(b'@')[2]
        if hostinfo.startswith(b'['):
            hostname = '[' + hostname.decode('ascii') + ']'
        else:
            hostname = hostname.decode('idna')

    port = parsed.port
    if port and not isinstance(port, int_types):
        port = port.decode('ascii')

    netloc = ''
    if username is not None:
        netloc += username
        if password:
            netloc += ':' + password
        netloc += '@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        netloc += ':' + str_cls(port)

    # Literal "/" in the path, and "&" and "=" in the query, are kept as-is,
    # while percent-encoded occurrences of them stay percent-encoded
    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
    fragment = _urlunquote(parsed.fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))
170
171
def _iri_utf8_errors_handler(exc):
    """
    Codec error callback used while decoding the UTF-8 parts of a URI into an
    IRI. The bytes that could not be decoded are emitted in %XX format as
    part of the unicode result.

    :param exc:
        The UnicodeDecodeError exception

    :return:
        A 2-element tuple of (replacement unicode string, integer index to
        resume at)
    """

    undecodable = exc.object[exc.start:exc.end]
    escaped = ''.join('%%%02x' % byte_value for byte_value in bytes_to_list(undecodable))
    return (escaped, exc.end)


# Expose the handler under the name "iriutf8" for use as the errors argument
# of decode() calls
codecs.register_error('iriutf8', _iri_utf8_errors_handler)
191
192
193def _urlquote(string, safe=''):
194    """
195    Quotes a unicode string for use in a URL
196
197    :param string:
198        A unicode string
199
200    :param safe:
201        A unicode string of character to not encode
202
203    :return:
204        None (if string is None) or an ASCII byte string of the quoted string
205    """
206
207    if string is None or string == '':
208        return None
209
210    # Anything already hex quoted is pulled out of the URL and unquoted if
211    # possible
212    escapes = []
213    if re.search('%[0-9a-fA-F]{2}', string):
214        # Try to unquote any percent values, restoring them if they are not
215        # valid UTF-8. Also, requote any safe chars since encoded versions of
216        # those are functionally different than the unquoted ones.
217        def _try_unescape(match):
218            byte_string = unquote_to_bytes(match.group(0))
219            unicode_string = byte_string.decode('utf-8', 'iriutf8')
220            for safe_char in list(safe):
221                unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
222            return unicode_string
223        string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)
224
225        # Once we have the minimal set of hex quoted values, removed them from
226        # the string so that they are not double quoted
227        def _extract_escape(match):
228            escapes.append(match.group(0).encode('ascii'))
229            return '\x00'
230        string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string)
231
232    output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8'))
233    if not isinstance(output, byte_cls):
234        output = output.encode('ascii')
235
236    # Restore the existing quoted values that we extracted
237    if len(escapes) > 0:
238        def _return_escape(_):
239            return escapes.pop(0)
240        output = re.sub(b'%00', _return_escape, output)
241
242    return output
243
244
245def _urlunquote(byte_string, remap=None, preserve=None):
246    """
247    Unquotes a URI portion from a byte string into unicode using UTF-8
248
249    :param byte_string:
250        A byte string of the data to unquote
251
252    :param remap:
253        A list of characters (as unicode) that should be re-mapped to a
254        %XX encoding. This is used when characters are not valid in part of a
255        URL.
256
257    :param preserve:
258        A bool - indicates that the chars to be remapped if they occur in
259        non-hex form, should be preserved. E.g. / for URL path.
260
261    :return:
262        A unicode string
263    """
264
265    if byte_string is None:
266        return byte_string
267
268    if byte_string == b'':
269        return ''
270
271    if preserve:
272        replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F']
273        preserve_unmap = {}
274        for char in remap:
275            replacement = replacements.pop(0)
276            preserve_unmap[replacement] = char
277            byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))
278
279    byte_string = unquote_to_bytes(byte_string)
280
281    if remap:
282        for char in remap:
283            byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))
284
285    output = byte_string.decode('utf-8', 'iriutf8')
286
287    if preserve:
288        for replacement, original in preserve_unmap.items():
289            output = output.replace(replacement, original)
290
291    return output
292