1# coding: utf-8 2 3""" 4Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports 5the following items: 6 7 - iri_to_uri() 8 - uri_to_iri() 9""" 10 11from __future__ import unicode_literals, division, absolute_import, print_function 12 13from encodings import idna # noqa 14import codecs 15import re 16import sys 17 18from ._errors import unwrap 19from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types 20 21if sys.version_info < (3,): 22 from urlparse import urlsplit, urlunsplit 23 from urllib import ( 24 quote as urlquote, 25 unquote as unquote_to_bytes, 26 ) 27 28else: 29 from urllib.parse import ( 30 quote as urlquote, 31 unquote_to_bytes, 32 urlsplit, 33 urlunsplit, 34 ) 35 36 37def iri_to_uri(value, normalize=False): 38 """ 39 Encodes a unicode IRI into an ASCII byte string URI 40 41 :param value: 42 A unicode string of an IRI 43 44 :param normalize: 45 A bool that controls URI normalization 46 47 :return: 48 A byte string of the ASCII-encoded URI 49 """ 50 51 if not isinstance(value, str_cls): 52 raise TypeError(unwrap( 53 ''' 54 value must be a unicode string, not %s 55 ''', 56 type_name(value) 57 )) 58 59 scheme = None 60 # Python 2.6 doesn't split properly is the URL doesn't start with http:// or https:// 61 if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'): 62 real_prefix = None 63 prefix_match = re.match('^[^:]*://', value) 64 if prefix_match: 65 real_prefix = prefix_match.group(0) 66 value = 'http://' + value[len(real_prefix):] 67 parsed = urlsplit(value) 68 if real_prefix: 69 value = real_prefix + value[7:] 70 scheme = _urlquote(real_prefix[:-3]) 71 else: 72 parsed = urlsplit(value) 73 74 if scheme is None: 75 scheme = _urlquote(parsed.scheme) 76 hostname = parsed.hostname 77 if hostname is not None: 78 hostname = hostname.encode('idna') 79 # RFC 3986 allows userinfo to contain sub-delims 80 username = _urlquote(parsed.username, safe='!$&\'()*+,;=') 81 password = _urlquote(parsed.password, safe='!$&\'()*+,;=') 82 port = parsed.port 83 if port is not None: 84 port = str_cls(port).encode('ascii') 85 86 netloc = b'' 87 if username is not None: 88 netloc += username 89 if password: 90 netloc += b':' + password 91 netloc += b'@' 92 if hostname is not None: 93 netloc += hostname 94 if port is not None: 95 default_http = scheme == b'http' and port == b'80' 96 default_https = scheme == b'https' and port == b'443' 97 if not normalize or (not default_http and not default_https): 98 netloc += b':' + port 99 100 # RFC 3986 allows a path to contain sub-delims, plus "@" and ":" 101 path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:') 102 # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?" 103 query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:') 104 # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?" 105 fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:') 106 107 if normalize and query is None and fragment is None and path == b'/': 108 path = None 109 110 # Python 2.7 compat 111 if path is None: 112 path = '' 113 114 output = urlunsplit((scheme, netloc, path, query, fragment)) 115 if isinstance(output, str_cls): 116 output = output.encode('latin1') 117 return output 118 119 120def uri_to_iri(value): 121 """ 122 Converts an ASCII URI byte string into a unicode IRI 123 124 :param value: 125 An ASCII-encoded byte string of the URI 126 127 :return: 128 A unicode string of the IRI 129 """ 130 131 if not isinstance(value, byte_cls): 132 raise TypeError(unwrap( 133 ''' 134 value must be a byte string, not %s 135 ''', 136 type_name(value) 137 )) 138 139 parsed = urlsplit(value) 140 141 scheme = parsed.scheme 142 if scheme is not None: 143 scheme = scheme.decode('ascii') 144 145 username = _urlunquote(parsed.username, remap=[':', '@']) 146 password = _urlunquote(parsed.password, remap=[':', '@']) 147 hostname = parsed.hostname 148 if hostname: 149 hostname = hostname.decode('idna') 150 port = parsed.port 151 if port and not isinstance(port, int_types): 152 port = port.decode('ascii') 153 154 netloc = '' 155 if username is not None: 156 netloc += username 157 if password: 158 netloc += ':' + password 159 netloc += '@' 160 if hostname is not None: 161 netloc += hostname 162 if port is not None: 163 netloc += ':' + str_cls(port) 164 165 path = _urlunquote(parsed.path, remap=['/'], preserve=True) 166 query = _urlunquote(parsed.query, remap=['&', '='], preserve=True) 167 fragment = _urlunquote(parsed.fragment) 168 169 return urlunsplit((scheme, netloc, path, query, fragment)) 170 171 172def _iri_utf8_errors_handler(exc): 173 """ 174 Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte 175 sequences encoded in %XX format, but as part of a unicode string. 176 177 :param exc: 178 The UnicodeDecodeError exception 179 180 :return: 181 A 2-element tuple of (replacement unicode string, integer index to 182 resume at) 183 """ 184 185 bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end]) 186 replacements = ['%%%02x' % num for num in bytes_as_ints] 187 return (''.join(replacements), exc.end) 188 189 190codecs.register_error('iriutf8', _iri_utf8_errors_handler) 191 192 193def _urlquote(string, safe=''): 194 """ 195 Quotes a unicode string for use in a URL 196 197 :param string: 198 A unicode string 199 200 :param safe: 201 A unicode string of character to not encode 202 203 :return: 204 None (if string is None) or an ASCII byte string of the quoted string 205 """ 206 207 if string is None or string == '': 208 return None 209 210 # Anything already hex quoted is pulled out of the URL and unquoted if 211 # possible 212 escapes = [] 213 if re.search('%[0-9a-fA-F]{2}', string): 214 # Try to unquote any percent values, restoring them if they are not 215 # valid UTF-8. Also, requote any safe chars since encoded versions of 216 # those are functionally different than the unquoted ones. 217 def _try_unescape(match): 218 byte_string = unquote_to_bytes(match.group(0)) 219 unicode_string = byte_string.decode('utf-8', 'iriutf8') 220 for safe_char in list(safe): 221 unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char)) 222 return unicode_string 223 string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string) 224 225 # Once we have the minimal set of hex quoted values, removed them from 226 # the string so that they are not double quoted 227 def _extract_escape(match): 228 escapes.append(match.group(0).encode('ascii')) 229 return '\x00' 230 string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string) 231 232 output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8')) 233 if not isinstance(output, byte_cls): 234 output = output.encode('ascii') 235 236 # Restore the existing quoted values that we extracted 237 if len(escapes) > 0: 238 def _return_escape(_): 239 return escapes.pop(0) 240 output = re.sub(b'%00', _return_escape, output) 241 242 return output 243 244 245def _urlunquote(byte_string, remap=None, preserve=None): 246 """ 247 Unquotes a URI portion from a byte string into unicode using UTF-8 248 249 :param byte_string: 250 A byte string of the data to unquote 251 252 :param remap: 253 A list of characters (as unicode) that should be re-mapped to a 254 %XX encoding. This is used when characters are not valid in part of a 255 URL. 256 257 :param preserve: 258 A bool - indicates that the chars to be remapped if they occur in 259 non-hex form, should be preserved. E.g. / for URL path. 260 261 :return: 262 A unicode string 263 """ 264 265 if byte_string is None: 266 return byte_string 267 268 if byte_string == b'': 269 return '' 270 271 if preserve: 272 replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F'] 273 preserve_unmap = {} 274 for char in remap: 275 replacement = replacements.pop(0) 276 preserve_unmap[replacement] = char 277 byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii')) 278 279 byte_string = unquote_to_bytes(byte_string) 280 281 if remap: 282 for char in remap: 283 byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii')) 284 285 output = byte_string.decode('utf-8', 'iriutf8') 286 287 if preserve: 288 for replacement, original in preserve_unmap.items(): 289 output = output.replace(replacement, original) 290 291 return output 292