• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
18Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
22
23Table generation:
24(c) Copyright Marc-Andre Lemburg, 2005.
25    Licensed to PSF under a Contributor Agreement.
26
27"""#"
28
29import re, os, marshal, codecs
30
# Maximum allowed size of charmap tables
# (python_tabledef_code() returns None when the largest key exceeds this)
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
# (written into decoding tables for positions that have no mapping)
UNI_UNDEFINED = chr(0xFFFE)

# Placeholder for a missing code point
# (returned by parsecodes() for empty input)
MISSING_CODE = -1

# Pattern for one line of a mapping file.  Groups:
#   1. encoded value: one or more 0x-prefixed hex codes joined by '+'
#   2. Unicode value: zero or more 0x-prefixed codes or <meta> tags
#      joined by '+'
#   3. optional trailing comment starting with '#'
# NOTE(review): the character class in group 2 is [0-9a-fA-Z], which also
# admits the letters G-Z; int(..., 16) in parsecodes() rejects those.
# Possibly a typo for A-F -- confirm before changing, since a narrower
# class would let such lines match partially instead of failing.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
45
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of '+'-separated hexadecimal codes such as
        '0x41' or '0x0041+0x0300'.

        Meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other parts that are not valid hexadecimal numbers are
        ignored, regardless of whether they appear alone or as part
        of a combination.

        Returns MISSING_CODE if codes is empty or no valid code is
        left after ignoring illegal parts, the integer itself if
        exactly one valid code is left, and a tuple of integers
        otherwise.

        The len and range parameters only exist for interface
        compatibility; range is not used.

    """
    if not codes:
        return MISSING_CODE
    values = []
    for part in codes.split('+'):
        try:
            value = int(part, 16)
        except ValueError:
            # Meta-code, empty part (trailing '+') or otherwise illegal
            # text: ignore it instead of aborting the whole conversion.
            continue
        # A part that parses to the placeholder itself carries no
        # information either.
        if value != MISSING_CODE:
            values.append(value)
    if not values:
        return MISSING_CODE
    if len(values) == 1:
        return values[0]
    return tuple(values)
72
def readmap(filename):

    """ Reads the mapping file filename and returns a dictionary
        mapping encoded values to (unicode value, comment) tuples.

        Blank lines, lines starting with '#' and lines not matched by
        mapRE are skipped.  Code points 0x00 - 0x1F and 0x7F start out
        as identity mappings and may be overridden by the file.

        If at least as many single-byte codes are identity-mapped as
        are left unmapped, the unmapped ones are set to
        (MISSING_CODE, "") and the extra key 'IDENTITY' is added with
        the value 256.

    """
    with open(filename) as mapfile:
        rawlines = mapfile.readlines()

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    controls = [*range(32), 127]
    enc2uni = {code: (code, 'CONTROL CHARACTER') for code in controls}
    identity = list(controls)
    unmapped = [code for code in range(256) if code not in controls]

    for rawline in rawlines:
        text = rawline.strip()
        if not text or text.startswith('#'):
            continue
        match = mapRE.match(text)
        if match is None:
            continue
        enc = parsecodes(match.group(1))
        uni = parsecodes(match.group(2))
        comment = match.group(3)
        comment = '' if comment is None else comment[1:].strip()
        # Only single-byte codes take part in the identity/unmapped
        # bookkeeping; the mapping itself is recorded in every case.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are at least as many identity-mapped entries as unmapped
    # entries, it pays to generate an identity dictionary first, and add
    # explicit MISSING_CODE mappings for the rest
    if len(identity) >= len(unmapped):
        enc2uni.update((code, (MISSING_CODE, "")) for code in unmapped)
        enc2uni['IDENTITY'] = 256

    return enc2uni
122
def hexrepr(t, precision=4):

    """ Returns the source text of t as hexadecimal literal(s).

        None gives 'None'.  An object without a length is formatted
        as a single '0x...' literal zero-padded to precision digits.
        Anything with a length is formatted item by item and wrapped
        in parentheses.

        A TypeError raised while formatting the items is reported on
        stdout and then re-raised.

    """
    if t is None:
        return 'None'
    literal = '0x%0*X'
    try:
        len(t)
    except TypeError:
        # No length: treat t as a single code.
        return literal % (precision, t)
    try:
        parts = [literal % (precision, item) for item in t]
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
    return '(' + ', '.join(parts) + ')'
137
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source lines defining varname as a
        dictionary holding the mappings in map.

        Keys and values of map may be plain codes or (code, comment)
        tuples.  If map has an 'IDENTITY' entry, the generated code
        starts from codecs.make_identity_dict() and only adds the
        entries that differ; the 'IDENTITY' entry is removed from map
        in place (codegen() passes the same dictionary on to other
        generators afterwards).

        Values equal to MISSING_CODE are written as None, entries
        whose key is None or MISSING_CODE are left out.

        comments controls whether mapping comments are appended to
        the lines; precisions gives the number of hex digits for keys
        and values.

    """
    lines = []
    append = lines.append
    identity = "IDENTITY" in map
    if identity:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
    else:
        append("%s = {" % varname)
        splits = 0

    key_precision, value_precision = precisions
    count = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # No code to map from (e.g. the undefined slot of an
            # encoding map).
            continue
        if mapvalue == MISSING_CODE:
            # hexrepr() would turn the -1 placeholder into the invalid
            # literal '0x-001'; None marks an undefined mapping.
            mapvalue = None
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t#  %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            count = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return lines
194
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source lines defining varname as a
        decoding table (a tuple of one-character strings indexed by
        code), or None if map cannot be expressed as a table.

        Keys and values of map may be plain codes or (code, comment)
        tuples.  If map has an 'IDENTITY' entry, the first 256
        positions start out as identity mappings and the entry is
        removed from map in place.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if any value is a tuple of codes (1-n mapping).  Positions
        without a mapping are filled with UNI_UNDEFINED.

        comments controls whether mapping comments are appended to
        the lines; key_precision gives the number of hex digits used
        for the key in those comments.

    """
    # Analyze map and create table dict
    table = {}
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        # Remove the marker before sorting: its string key cannot be
        # ordered against the integer codes.
        del map['IDENTITY']
    mappings = sorted(map.items())
    maxkey = 255
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    lines = ['%s = (' % varname]
    append = lines.append
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append('    %a \t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    if maxchar < 256:
        # All characters fit into one byte: add one wide character.
        append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return lines
252
def codegen(name, map, encodingname, comments=1):

    """ Builds the Python source text of a charmap codec module for
        the given map.

        encodingname and name are written into the docstring of the
        generated module; encodingname, with underscores replaced by
        hyphens, is also the codec name used in getregentry().

        The generated module uses a decoding table if one can be
        created for map and falls back to decoding/encoding maps
        otherwise.  Comments are included in the source, if comments
        is true (default).

    """
    codec_template = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
'''
    incremental_template = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]'''
    registry_template = '''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
'''

    # Generate code.  The decoding map comes first: generating it
    # removes the 'IDENTITY' entry from map before the other two
    # generators look at it.
    decoding_map_code = python_mapdef_code(
        'decoding_map', map, comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map', codecs.make_encoding_map(map),
        comments=comments, precisions=(4, 2))

    use_table = bool(decoding_table_code)
    suffix = 'table' if use_table else 'map'

    parts = [
        codec_template % (encodingname, name, suffix, suffix),
        incremental_template % (suffix, suffix),
        registry_template % encodingname.replace('_', '-'),
    ]

    # Add decoding and encoding data (with preference to the table)
    if use_table:
        parts.append('''
### Decoding Table
''')
        parts.extend(decoding_table_code)
        parts.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        parts.append('''
### Decoding Map
''')
        parts.extend(decoding_map_code)
        parts.append('''
### Encoding Map
''')
        parts.extend(encoding_map_code)

    # Final new-line
    parts.append('')

    return '\n'.join(parts).expandtabs()
357
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated by codegen() for
        map to the file pyfile.

        The source is generated before pyfile is opened for writing.

    """
    source = codegen(name, map, encodingname, comments)
    with open(pyfile, 'w') as out:
        out.write(source)
363
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to unpack into two items; it is stored
        as a 2-tuple.  name is not used.

    """
    table = {enc: (uni, comment) for enc, (uni, comment) in map.items()}
    with open(marshalfile, 'wb') as out:
        marshal.dump(table, out)
371
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts every regular file in dir into a codec module and a
        marshalled mapping file.

        The output name is nameprefix plus the part of the file name
        before the first '.', lowercased and with hyphens replaced by
        underscores; dirprefix is prepended to the resulting '.py'
        and '.mapping' file names.

        A ValueError raised during a conversion is reported on stdout
        and re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        name = nameprefix + basename.replace('-', '_').split('.')[0].lower()
        codefile = dirprefix + name + '.py'
        marshalfile = dirprefix + name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              codefile,
                                              marshalfile))
        try:
            mapping = readmap(os.path.join(dir, mapname))
            if mapping:
                pymap(mappathname, mapping, codefile, name, comments)
                marshalmap(mappathname, mapping, marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
399
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the '.mapping' files in dir.

        Each '<name>.mapping' file is loaded with marshal and, unless
        the loaded map is empty, written as codec source to
        dirprefix + '<name>.py'.  Other files are ignored.

        A ValueError raised during a conversion is reported on stdout
        and the next file is processed.

    """
    extension = '.mapping'
    for mapname in os.listdir(dir):
        if not mapname.endswith(extension):
            continue
        name = mapname[:-len(extension)]
        codefile = dirprefix + name + '.py'
        print('converting %s to %s' % (mapname,
                                       codefile))
        try:
            # NOTE: marshal data must come from a trusted source.
            with open(os.path.join(dir, mapname), 'rb') as infile:
                mapping = marshal.load(infile)
            if mapping:
                pymap(mapname, mapping, codefile, name, comments)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
419
if __name__ == '__main__':

    import sys
    # The constant condition acts as a manual switch: as written,
    # convertdir() always runs and the rewritepythondir() branch is
    # never reached.
    if 1:
        # The command line arguments are passed on positionally (as
        # strings): dir, dirprefix, nameprefix, comments.
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])
427