#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent
from functools import partial

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst,
#   * Doc/library/unicodedata.rst, and
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in the BMP (U+E000..U+F8FF) and in planes 15 and 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges within PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
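                # each decomposition entry is stored as a packed header word
                # (decomposition length in the high bits, prefix index in the
                # low byte) followed by the decomposed code points, e.g.
                # U+00C0 -> [0x0200, 0x0041, 0x0300] (no prefix, length 2)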
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
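    # comp_data is a flattened total_first x total_last matrix: the NFC
    # composition of a (first, last) pair is stored at
    # comp_data[comp_first[first]*total_last + comp_last[last]], and the
    # nfc_first/nfc_last reindex tables written below map code points to
    # those row/column positions.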

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
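            # Simple one-to-one mappings are stored below as signed deltas
            # from the code point itself; anything that needs more than one
            # code point (or a casefold different from the lowercase) goes
            # through the extended case array instead.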
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
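                # Each extended-case reference packs the offset into
                # extra_casing in the low bits, the mapping length in
                # bits 24 and up, and (for the lowercase slot only) the
                # length of a differing casefold mapping in bits 20-23.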
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the
    # latter includes the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
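        # e.g. the word "CAPITAL" is stored as "CAPITA" followed by chr(ord("L") + 128)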
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
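    # word indexes below `short` fit in a single phrasebook byte; larger
    # indexes are stored as an escape byte (short + high bits) followed by
    # the low byte, which keeps the phrasebook itself a plain byte array
    # (checked by the getsize() assert below)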
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split phrasebook offset table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value had better not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
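    # The tuples zipped together below give, for every code point, the
    # (bidir, category, decimal, mirrored, east_asian_width, numeric)
    # changes; makeunicodedata() writes each distinct tuple out as a
    # change_record initializer in that field order.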
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))


def open_data(template, version):
    local = template % ('-'+version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
            if cjk_check and cjk_ranges != cjk_ranges_found:
                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in PUA 15, in the range U+F0200..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
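        # quickchecks packs eight bits per code point: two bits for each
        # property in qc_order order, with 0 = Yes, 1 = Maybe, 2 = No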
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))


# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.
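# the hash is case-insensitive (keys are upper-cased) and folds any bits
# above bit 24 back into the low bits, so values stay within 24 bits.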

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h


SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]


class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
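            # slot taken: probe with an increment derived from the hash;
            # on each further collision the increment is doubled and
            # reduced by the polynomial chosen above (a Galois LFSR step)
            # to pick the next slot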
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")


def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
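    # A small illustrative example: for
    #     t = [10,10,10,10, 10,10,10,10, 20,20,20,20, 10,10,10,10]
    # a shift of 2 yields t1 = [0, 0, 1, 0] and
    # t2 = [10,10,10,10, 20,20,20,20]; e.g.
    # t[9] == t2[(t1[9 >> 2] << 2) + (9 & 3)] == t2[5] == 20.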

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


if __name__ == "__main__":
    maketables(1)