#
#   Cython -- encoding related tools
#

import re
import sys

# Version shims: bind the text/bytes types and the "code point -> character"
# builtin under version-independent names so the rest of the module does not
# reference Py2-only builtins (unicode, unichr) directly.
if sys.version_info[0] >= 3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
    IS_PYTHON3 = True
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
    IS_PYTHON3 = False

empty_bytes = _bytes()
empty_unicode = _unicode()

join_bytes = empty_bytes.join


class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.

    Pieces are collected in ``self.chars`` and joined by getstring().
    """
    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Append a piece of text; byte strings are decoded as ASCII."""
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
                self.chars.append(_unichr((char_number // 1024) + 0xD800))
                self.chars.append(_unichr((char_number % 1024) + 0xDC00))
            else:
                self.chars.append(_unichr(char_number))
    else:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape; only the code point is used here."""
        self.append_charval(char_number)

    def getstring(self):
        """Return the collected text as an EncodedString."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return a ``(bytes_value, unicode_value)`` pair; bytes is None."""
        return (None, self.getstring())


class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.

    Unicode pieces are encoded with ``target_encoding`` as they are appended.
    """
    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a piece; unicode input is encoded to the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append a single byte given by its numeric value.

        The value is mapped through ISO-8859-1, so values above 255 raise
        UnicodeEncodeError.
        """
        self.chars.append(_unichr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape verbatim (the escape text, not the char)."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a BytesLiteral tagged with the
        target encoding."""
        # this *must* return a byte string!
        s = BytesLiteral(join_bytes(self.chars))
        s.encoding = self.target_encoding
        return s

    def getchar(self):
        """Return the collected value; same result as getstring()."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a ``(bytes_value, unicode_value)`` pair; unicode is None."""
        return (self.getstring(), None)


class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append the same piece to both representations."""
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        """Append the same character value to both representations."""
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape: the bytes side keeps the escape text,
        the unicode side gets the character itself."""
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the ``(bytes_value, unicode_value)`` pair."""
        return (self._bytes.getstring(), self._unicode.getstring())


class EncodedString(_unicode):
    # unicode string subclass to keep track of the original encoding.
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share the same object instead of duplicating it
        return self

    def byteencode(self):
        """Encode with the recorded source encoding (must be set)."""
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid when no source encoding is recorded."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """See string_contains_surrogates()."""
        return string_contains_surrogates(self)


def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.
    """
    for c in map(ord, ustring):
        if c > 65535:  # can only happen on wide platforms
            return True
        if 0xD800 <= c <= 0xDFFF:
            return True
    return False


class BytesLiteral(_bytes):
    # bytes subclass that is compatible with EncodedString
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share the same object instead of duplicating it
        return self

    def byteencode(self):
        """Return the content as a plain bytes object (not this subclass)."""
        if IS_PYTHON3:
            return _bytes(self)
        else:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        """Always fails: a byte string cannot be UTF-8 encoded."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    is_unicode = False


# Lookup function (dict.get) mapping a two-character escape sequence such as
# r'\n' to the character it denotes; returns None for unknown sequences.
char_from_escape_sequence = {
    r'\a': u'\a',
    r'\b': u'\b',
    r'\f': u'\f',
    r'\n': u'\n',
    r'\r': u'\r',
    r'\t': u'\t',
    r'\v': u'\v',
    }.get

# Character sequences that _replace_specials() rewrites: backslash, the
# two-character sequence '??' (presumably to avoid C trigraphs), the double
# quote, and all control characters below 32.
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))


def _to_escape_sequence(s):
    """Return the escaped spelling used for the special sequence ``s``."""
    if s in '\n\r\t':
        return repr(s)[1:-1]
    elif s == '"':
        return r'\"'
    elif s == '\\':
        return r'\\'
    else:
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])


def _build_specials_replacer():
    """Build a function that escapes every _c_special sequence in a byte
    string, using one precompiled bytes regex and a replacement table."""
    subexps = []
    replacements = {}
    for special in _c_special:
        # one single-character class per character, so no regex
        # metacharacter (e.g. '?') keeps its special meaning
        regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
        subexps.append(regexp)
        replacements[special.encode('ASCII')] = (
            _to_escape_sequence(special).encode('ASCII'))
    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub

    def replace_specials(m):
        return replacements[m.group(1)]

    def replace(s):
        return sub(replace_specials, s)
    return replace

_replace_specials = _build_specials_replacer()


def escape_char(c):
    """Return the escaped spelling of the single-byte string ``c``.

    Newline, carriage return, tab, backslash and the single quote get their
    backslash form; other values below 32 or above 127 become ``\\xHH``;
    everything else is returned unchanged.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    elif c == "'":
        return "\\'"
    n = ord(c)
    if n < 32 or n > 127:
        # hex works well for characters
        return "\\x%02X" % n
    else:
        return c


def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        return s.decode("ASCII")  # trial decoding: plain ASCII => done
    except UnicodeDecodeError:
        pass
    # Non-ASCII bytes remain: spell each byte >= 128 as an octal escape
    # (values >= 128 always yield three octal digits).
    if IS_PYTHON3:
        s_new = bytearray()
        append, extend = s_new.append, s_new.extend
        for b in s:
            if b >= 128:
                extend(('\\%3o' % b).encode('ASCII'))
            else:
                append(b)
        return s_new.decode('ISO-8859-1')
    else:
        l = []
        append = l.append
        for c in s:
            o = ord(c)
            if o >= 128:
                append('\\%3o' % o)
            else:
                append(c)
        return join_bytes(l).decode('ISO-8859-1')


def split_string_literal(s, limit=2000):
    """Split the literal text ``s`` into chunks of at most ``limit``
    characters, joined by ``""`` (close quote, open quote).

    Strings shorter than ``limit`` are returned unchanged.
    """
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s
    else:
        start = 0
        chunks = []
        while start < len(s):
            end = start + limit
            # do not cut through an escape sequence: if a backslash occurs
            # in the last four characters of the chunk, end the chunk there
            if len(s) > end-4 and '\\' in s[end-4:end]:
                end -= 4 - s[end-4:end].find('\\')  # just before the backslash
                while s[end-1] == '\\':
                    end -= 1
                    if end == start:
                        # must have been a long line of backslashes
                        end = start + limit - (limit % 2) - 4
                        break
            chunks.append(s[start:end])
            start = end
        return '""'.join(chunks)


def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair ``(utf16, utf32)`` of comma-separated decimal code unit
    lists, each terminated by a 0 entry.  The UTF-16 part is the empty
    string when both representations are identical.
    """
    # materialize the map() result: on Python 3 it is an iterator and
    # cannot be concatenated with a list
    s = list(map(ord, s)) + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                # low surrogate following a high one: merge into one code point
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))