""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"

import codecs
import marshal
import os
import re

# The script was written for Python 2.  The syntax used below is also
# valid on Python 3, where chr() takes the place of unichr().
# NOTE(review): the code generators sort map items whose keys may be of
# mixed types (None, ints, tuples); that relies on Python 2 ordering
# rules and is not expected to work for such maps on Python 3.
try:
    unichr
except NameError:
    unichr = chr

# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = unichr(0xFFFE)

# One mapping line: <encoded code(s)> <whitespace> <Unicode code(s)>
# [<whitespace>] [# comment].  Codes are joined with '+'.
# NOTE(review): the second character class reads [0-9a-fA-Z], which looks
# like a typo for A-F.  It is left untouched because narrowing it would
# turn a ValueError from parsecodes() into a silently undefined mapping.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')

def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal literals joined by '+',
        e.g. '0x41' or '0x41+0x0300'.

        An empty string is returned as None.

        If codes contains no '+', it is converted with int(..., 16)
        and a ValueError is raised when that fails (this includes a
        lone meta-code such as '<LR>').

        In a '+' combination, parts which cannot be converted
        (meta-codes in angular brackets, empty parts) are ignored.
        One remaining code is returned as integer, anything else as
        a (possibly empty) tuple of integers.

        The len and range parameters only exist for backwards
        compatibility of the signature.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        # A lone code is converted strictly; errors propagate.
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code or empty part of a combination: drop it.
            pass
    if len(values) == 1:
        return values[0]
    return tuple(values)

def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result maps encoded codes (integers, or tuples of
        integers for combinations) to (unicode_code, comment) tuples.
        unicode_code is whatever parsecodes() returned for the second
        column, i.e. it may be None.

        If there are at least as many identity entries as unmapped
        entries in the range 0-255, the unmapped codes are added as
        (None, '') entries and the special key 'IDENTITY' is set to
        256 to tell the code generators to start from an identity
        dictionary.

        ValueErrors raised by parsecodes() are passed on.

    """
    with open(filename, 'r') as f:
        lines = f.readlines()

    enc2uni = {}
    # Number of identity hits.  A control code which the file maps to
    # itself again counts twice.
    identity_count = 0
    unmapped = set(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for code in list(range(32)) + [127]:
        identity_count += 1
        unmapped.discard(code)
        enc2uni[code] = (code, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#'
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity
        # bookkeeping; code combinations (tuples) are just stored.
        if not isinstance(enc, tuple) and enc < 256:
            unmapped.discard(enc)
            if enc == uni:
                identity_count += 1
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if identity_count >= len(unmapped):
        for enc in sorted(unmapped):
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni

def hexrepr(t, precision=4):

    """ Returns the source code representation of the code t.

        None gives 'None', an object without a length (an integer)
        gives a hex literal zero-padded to precision digits, and a
        sequence gives a parenthesized, comma separated list of such
        literals.

        If an item of a sequence cannot be formatted, a message is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise

def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the source lines defining the dictionary varname.

        map maps keys to values; a key or a value may be given as a
        (code, comment) tuple, in which case the comment is emitted
        after the entry if comments is true. Entries with a None key
        are skipped.

        precisions is the pair of hex digit counts used for keys and
        values.

        If map contains the key 'IDENTITY', the dictionary is built
        from codecs.make_identity_dict() and identity entries below
        256 are left out.

        Note that the 'IDENTITY' key is deleted from map in that
        case: codegen() and marshalmap() rely on this side effect.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the plain literal or the last update() call
    if splits == 0:
        append('}')
    else:
        append('})')

    return l

def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the source lines defining the decoding table varname,
        or None if no table can be generated.

        The table is written as a parenthesized sequence of one
        character literals, one per code from 0 up to the largest key
        in map. Codes without an entry, or mapped to None, are
        written as UNI_UNDEFINED.

        None is returned if the largest key exceeds MAX_TABLE_SIZE
        or if a code maps to more than one character.

        If map contains the key 'IDENTITY', the first 256 entries
        default to the identity mapping and the key is deleted from
        map.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        # Remove the marker before looking at the items; otherwise it
        # would be treated as a mapping entry and spoil maxkey.
        del map['IDENTITY']
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append('    %r\t# %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %r' % mapchar)

    append(')')
    return l

def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is mentioned in the module docstring as the source the
        codec was generated from. encodingname is mentioned there as
        well and, with underscores replaced by hyphens, becomes the
        codec name in getregentry().

        Comments are included in the source, if comments is true (default).

        A decoding table is preferred; the decoding and encoding
        dictionaries are only written if no table could be built.

        The 'IDENTITY' key is removed from map as a side effect of
        python_mapdef_code().

    """
    # Generate code.  The order matters: the first call strips the
    # 'IDENTITY' marker from map before the other two look at it.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()

def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec source generated by codegen() to pyfile.

    """
    code = codegen(name,map,encodingname,comments)
    with open(pyfile,'w') as f:
        f.write(code)

def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to marshalfile.

        Every value of map has to be a (code, comment) pair, so the
        'IDENTITY' marker must already have been removed (codegen()
        does that). name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)

def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in dir.

        For each regular file a codec module <name>.py and a
        marshalled map <name>.mapping are written, where <name> is
        nameprefix plus the part of the file name before the first
        dot, lowercased and with hyphens replaced by underscores.
        dirprefix is prepended to both output file names.

        A ValueError during conversion is reported and re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        name = nameprefix + name
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(os.path.join(dir,mapname))
            if not map:
                print('* map is empty; skipping')
            else:
                # pymap() has to run first: it strips the 'IDENTITY'
                # marker which marshalmap() cannot handle.
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise

def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the .mapping files in dir.

        Each <name>.mapping file is unmarshalled and written as
        dirprefix + <name>.py. A ValueError is reported and the file
        is skipped.

        The .mapping files are read with marshal and therefore have
        to come from a trusted source.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)

if __name__ == '__main__':

    import sys
    # Developer switch: change to 0 to regenerate the modules from
    # existing .mapping files instead of parsing the mapping files.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])