#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = os.path.normpath(sys.argv[0])
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "15.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in the BMP (plane 0) and in planes 15 and 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
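# As a concrete illustration of how these masks combine (a sketch, not used by
# the generator itself): 'A' (U+0041, general category Lu) ends up with
# ALPHA_MASK | UPPER_MASK | PRINTABLE_MASK | XID_START_MASK |
# XID_CONTINUE_MASK | CASED_MASK set in its type record, see makeunicodetype().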

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DBF'),    # CJK Ideograph Extension A
    ('4E00', '9FFF'),    # CJK Ideograph
    ('20000', '2A6DF'),  # CJK Ideograph Extension B
    ('2A700', '2B739'),  # CJK Ideograph Extension C
    ('2B740', '2B81D'),  # CJK Ideograph Extension D
    ('2B820', '2CEA1'),  # CJK Ideograph Extension E
    ('2CEB0', '2EBE0'),  # CJK Ideograph Extension F
    ('2EBF0', '2EE5D'),  # CJK Ideograph Extension I
    ('30000', '3134A'),  # CJK Ideograph Extension G
    ('31350', '323AF'),  # CJK Ideograph Extension H
]


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    # the default value of east_asian_width is "N", for unassigned code points
    # not mentioned in EastAsianWidth.txt
    # in addition there are some reserved but unassigned code points in CJK
    # ranges that are classified as "W". code points in private use areas
    # have a width of "A". both of these have entries in
    # EastAsianWidth.txt
    # see https://unicode.org/reports/tr11/#Unassigned
    assert EASTASIANWIDTH_NAMES[0] == "N"
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
        elif unicode.widths[char] is not None:
            # an unassigned but reserved character, with a known
            # east_asian_width
            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
            item = (0, 0, 0, 0, eastasianwidth, 0)
        else:
            continue

        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data_cache = {}
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                key = tuple(decomp)
                i = decomp_data_cache.get(key, -1)
                if i == -1:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
                    decomp_data_cache[key] = i
                else:
                    assert decomp_data[i:i+len(decomp)] == decomp
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

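    # One concrete pair, for orientation (illustrative only): U+0041 LATIN
    # CAPITAL LETTER A followed by U+0300 COMBINING GRAVE ACCENT composes to
    # U+00C0, so 0x00C0 is stored at
    # comp_data[comp_first[0x41]*total_last + comp_last[0x300]].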
    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
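                # A familiar example of this branch (from SpecialCasing.txt):
                # U+00DF LATIN SMALL LETTER SHARP S uppercases to "SS", so its
                # full mappings land in extra_casing and EXTENDED_CASE_MASK is
                # set below.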
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split the type index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
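        # For orientation, U+00BD VULGAR FRACTION ONE HALF (numeric value 1/2)
        # contributes roughly the following to the generated switch (other
        # code points with the same value share the return statement):
        #     case 0x00BD:
        #         return (double) 1.0/2.0;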
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):
    from dawg import build_compression_dawg

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        assert max(len(name) for name, _ in data) < 256
        fprint()

        fprint("/* name->code dictionary */")
        packed_dawg, pos_to_codepoint = build_compression_dawg(data)
        notfound = len(pos_to_codepoint)
        inverse_list = [notfound] * len(unicode.chars)
        for pos, codepoint in enumerate(pos_to_codepoint):
            inverse_list[codepoint] = pos
        Array("packed_name_dawg", list(packed_dawg)).dump(fp, trace)
        Array("dawg_pos_to_codepoint", pos_to_codepoint).dump(fp, trace)
        index1, index2, shift = splitbins(inverse_list, trace)
        fprint("#define DAWG_CODEPOINT_TO_POS_SHIFT", shift)
        fprint("#define DAWG_CODEPOINT_TO_POS_NOTFOUND", notfound)
        Array("dawg_codepoint_to_pos_index1", index1).dump(fp, trace)
        Array("dawg_codepoint_to_pos_index2", index2).dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value had better not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))


DATA_DIR = os.path.join('Tools', 'unicode', 'data')

def open_data(template, version):
    local = os.path.join(DATA_DIR, template % ('-'+version,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')

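# For example (following the code above): with UNIDATA_VERSION "15.1.0",
# open_data(UNICODE_DATA, UNIDATA_VERSION) reads
# Tools/unicode/data/UnicodeData-15.1.0.txt, downloading it first from
# https://www.unicode.org/Public/15.1.0/ucd/UnicodeData.txt if it is missing.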

def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    '''
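    # For instance (a quick sketch): expand_range('0030..0032') yields
    # 0x30, 0x31 and 0x32, while expand_range('1F600') yields just 0x1F600.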
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    for char in range(first, last+1):
        yield char


class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''
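    # As a rough example, the UnicodeData.txt line
    #   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
    # comes out of records() as the 15-element list
    #   ['0041', 'LATIN CAPITAL LETTER A', 'Lu', '0', 'L', '', '', '', '',
    #    'N', '', '', '', '0061', '']
    # (comments stripped, fields split on ';' and whitespace-trimmed).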

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        with open_data(self.template, self.version) as file:
            for line in file:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                yield [field.strip() for field in line.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        for record in self.records():
            char_range, rest = record[0], record[1:]
            for char in expand_range(char_range):
                yield char, rest


@dataclasses.dataclass
class UcdRecord:
    # 15 fields from UnicodeData.txt .  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    return UcdRecord(*row, None, set(), 0)


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
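        # The First/Last sentinel rows folded away above look roughly like
        # this in UnicodeData.txt (illustrative excerpt):
        #   4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
        #   9FFF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
        # every code point in between inherits the fields of the First record.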

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the plane 15 PUA
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the plane 15 PUA, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the plane 15 PUA
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]
        self.widths = widths

        for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if propinfo:
                # this is not a binary property, ignore it
                continue

            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(propname)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
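        # Resulting bitmask layout (implied by the loop below): two bits per
        # property, in the order NFD_QC, NFKD_QC, NFC_QC, NFKC_QC, with
        # 0 = Yes, 1 = Maybe, 2 = No.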
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))



# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")

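# For instance (a rough sketch of the emitted C), Array("demo", [1, 2, 3]).dump(f)
# would write something like:
#   static const unsigned char demo[] = {
#       1, 2, 3,
#   };
# choosing unsigned char/short/int via getsize() below.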

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
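    # e.g. getsize([7, 200]) == 1, getsize([300]) == 2, getsize([70000]) == 4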


def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


if __name__ == "__main__":
    maketables(1)
