""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"

import codecs
import marshal
import os
import re

# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Placeholder for a missing code point
MISSING_CODE = -1

# One mapping line: "<encoded codes> <unicode codes> [# comment]".
# Codes are '+'-joined hex literals; the Unicode column may also hold
# meta-codes in angular brackets and may be empty.
# NOTE(review): 'A-Z' in the second group looks like a typo for 'A-F'.
# It is left unchanged on purpose: a malformed hex value still matches
# and then fails loudly in int() instead of being silently read as an
# empty (undefined) mapping.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')

def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of '+'-separated hexadecimal codes, e.g.
        '0x41' or '0x41+0x42'.

        An empty codes string is returned as MISSING_CODE.

        In a '+' combination, parts which cannot be parsed as
        hexadecimal numbers (meta-codes in angular brackets, e.g. <LR>
        and <RL>, or empty parts left by a trailing '+') are ignored.
        If exactly one code remains it is returned as integer,
        otherwise a (possibly empty) tuple is returned.

        A single code without '+' is converted directly, so a lone
        unparsable code raises ValueError.

        The len and range parameters are only kept for backwards
        compatibility of the signature.

    """
    if not codes:
        return MISSING_CODE
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            value = int(part, 16)
        except ValueError:
            # Meta-code or empty part: not a code point
            continue
        if value != MISSING_CODE:
            values.append(value)
    if len(values) == 1:
        return values[0]
    return tuple(values)

def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map
        as dictionary {enc: (uni, comment)}.

        enc and uni are the values returned by parsecodes() for the
        two code columns, comment is the text following '#' on the
        line ('' if there is none). Blank lines, lines starting with
        '#' and lines not matching mapRE are skipped.

        If at least as many single codes below 256 are identity mapped
        as are left unmapped, the unmapped ones are entered as
        (MISSING_CODE, "") and the marker entry 'IDENTITY': 256 is
        added to the dictionary.

    """
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            m = mapRE.match(line)
            if not m:
                continue
            enc, uni, comment = m.groups()
            enc = parsecodes(enc)
            uni = parsecodes(uni)
            if comment is None:
                comment = ''
            else:
                # Drop the leading '#'
                comment = comment[1:].strip()
            if not isinstance(enc, tuple) and enc < 256:
                # Bookkeeping for the identity heuristic below
                if enc in unmapped:
                    unmapped.remove(enc)
                if enc == uni:
                    identity.append(enc)
            enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to MISSING_CODE for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni

def hexrepr(t, precision=4):

    """ Returns the source code representation of t as hexadecimal
        literal with at least precision digits.

        None is returned as 'None', objects without a length as a
        single '0x...' literal and sequences as parenthesized, comma
        separated list of such literals.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # Not a sequence: format as single number
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise

def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines which define the
        dictionary varname from map.

        Keys and values of map may be plain codes or (code, comment)
        tuples. Comments are added to the lines if comments is true.
        precisions gives the number of hex digits to use for keys and
        values.

        If map has an 'IDENTITY' entry, the generated dictionary is
        created with codecs.make_identity_dict() and only updated with
        the non-identity mappings. Note that the 'IDENTITY' entry is
        removed from map in place.

        Entries with a missing key (None or MISSING_CODE) are skipped;
        missing values (MISSING_CODE) are written as None, which the
        charmap codec treats as undefined mapping.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # Nothing to map from
            continue
        if mapvalue == MISSING_CODE:
            # hexrepr() would turn the placeholder into the invalid
            # literal '0x-001'; undefined mappings are written as None
            mapvalue = None
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l

def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines which define the decoding
        table varname (a string built from one literal per code) from
        map, or None if no table can be generated.

        No table is generated if the largest key exceeds
        MAX_TABLE_SIZE or if map contains 1-n mappings (tuple values).

        Codes without mapping and codes mapped to MISSING_CODE are
        written as UNI_UNDEFINED. If all characters are below 256, one
        extra UNI_UNDEFINED entry is appended to widen the table.

        If map has an 'IDENTITY' entry, the first 256 codes default to
        the identity mapping. Note that the 'IDENTITY' entry is
        removed from map in place.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        del map['IDENTITY']
    # Sort only after the 'IDENTITY' marker is gone: its string key
    # cannot be ordered against the integer codes
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append('    %a \t# %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    if maxchar < 256:
        append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l

def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is the name of the mapping source and encodingname the
        name of the encoding; both are written to the header of the
        generated module, encodingname (with '_' replaced by '-') is
        also used as codec name.

        The generated module uses a decoding table if one can be
        generated for map and falls back to decoding and encoding
        dictionaries otherwise.

        Comments are included in the source, if comments is true (default).

        Note that an 'IDENTITY' entry is removed from map in place.

    """
    # Generate code; the map code has to be generated first, since it
    # strips the 'IDENTITY' marker the other generators don't expect
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()

def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated by codegen() for map
        to the file pyfile.

        Like codegen(), this removes an 'IDENTITY' entry from map in
        place.

    """
    code = codegen(name,map,encodingname,comments)
    with open(pyfile,'w') as f:
        f.write(code)

def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        map is a dictionary {enc: (uni, comment)}; a bookkeeping entry
        'IDENTITY' (whose value is a plain integer, not a pair) is
        copied as is instead of being unpacked.

        name is not used.

    """
    d = {}
    for e, value in map.items():
        if e == 'IDENTITY':
            # Marker entry, not an (uni, comment) pair
            d[e] = value
            continue
        (u, c) = value
        d[e] = (u,c)
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)

def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir to codec
        modules and marshalled mapping files.

        The output name is nameprefix plus the part of the file name
        before the first '.', lowercased and with hyphens replaced by
        underscores. The files are written to dirprefix + name + '.py'
        and dirprefix + name + '.mapping'. Entries of dir which are not
        regular files are skipped.

        Comments are included in the generated modules, if comments is
        true (default).

        A ValueError raised during conversion is reported and then
        re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        name = nameprefix + name
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise

def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the marshalled mapping
        files ('.mapping' extension) found in the directory dir.

        The modules are written to dirprefix + name + '.py' with name
        being the mapping file name without the extension.

        Comments are included in the generated modules, if comments is
        true (default).

        A ValueError raised during conversion is reported and the
        file is skipped.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal is not safe against malicious data; only
            # use this on mapping files written by this tool.
            with open(os.path.join(dir, mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)

if __name__ == '__main__':

    import sys
    # Change the constant to 0 to regenerate the modules from existing
    # .mapping files instead of parsing the mapping files again.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])