1# 2# (re)generate unicode property and type databases 3# 4# This script converts Unicode database files to Modules/unicodedata_db.h, 5# Modules/unicodename_db.h, and Objects/unicodetype_db.h 6# 7# history: 8# 2000-09-24 fl created (based on bits and pieces from unidb) 9# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table 10# 2000-09-25 fl added character type table 11# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0) 12# 2000-11-03 fl expand first/last ranges 13# 2001-01-19 fl added character name tables (2.1) 14# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold 15# 2002-09-11 wd use string methods 16# 2002-10-18 mvl update to Unicode 3.2 17# 2002-10-22 mvl generate NFC tables 18# 2002-11-24 mvl expand all ranges, sort names version-independently 19# 2002-11-25 mvl add UNIDATA_VERSION 20# 2004-05-29 perky add east asian width information 21# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta 22# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch 23# 2011-10-21 ezio add support for name aliases and named sequences 24# 2012-01 benjamin add full case mappings 25# 26# written by Fredrik Lundh (fredrik@pythonware.com) 27# 28 29import dataclasses 30import os 31import sys 32import zipfile 33 34from functools import partial 35from textwrap import dedent 36from typing import Iterator, List, Optional, Set, Tuple 37 38SCRIPT = os.path.normpath(sys.argv[0]) 39VERSION = "3.3" 40 41# The Unicode Database 42# -------------------- 43# When changing UCD version please update 44# * Doc/library/stdtypes.rst, and 45# * Doc/library/unicodedata.rst 46# * Doc/reference/lexical_analysis.rst (two occurrences) 47UNIDATA_VERSION = "15.1.0" 48UNICODE_DATA = "UnicodeData%s.txt" 49COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" 50EASTASIAN_WIDTH = "EastAsianWidth%s.txt" 51UNIHAN = "Unihan%s.zip" 52DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" 53DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" 54LINE_BREAK = "LineBreak%s.txt" 55NAME_ALIASES = "NameAliases%s.txt" 56NAMED_SEQUENCES = "NamedSequences%s.txt" 57SPECIAL_CASING = "SpecialCasing%s.txt" 58CASE_FOLDING = "CaseFolding%s.txt" 59 60# Private Use Areas -- in planes 1, 15, 16 61PUA_1 = range(0xE000, 0xF900) 62PUA_15 = range(0xF0000, 0xFFFFE) 63PUA_16 = range(0x100000, 0x10FFFE) 64 65# we use this ranges of PUA_15 to store name aliases and named sequences 66NAME_ALIASES_START = 0xF0000 67NAMED_SEQUENCES_START = 0xF0200 68 69old_versions = ["3.2.0"] 70 71CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", 72 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", 73 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk", 74 "So" ] 75 76BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", 77 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", 78 "ON", "LRI", "RLI", "FSI", "PDI" ] 79 80# "N" needs to be the first entry, see the comment in makeunicodedata 81EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ] 82 83MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] 84 85# note: should match definitions in Objects/unicodectype.c 86ALPHA_MASK = 0x01 87DECIMAL_MASK = 0x02 88DIGIT_MASK = 0x04 89LOWER_MASK = 0x08 90LINEBREAK_MASK = 0x10 91SPACE_MASK = 0x20 92TITLE_MASK = 0x40 93UPPER_MASK = 0x80 94XID_START_MASK = 0x100 95XID_CONTINUE_MASK = 0x200 96PRINTABLE_MASK = 0x400 97NUMERIC_MASK = 0x800 98CASE_IGNORABLE_MASK = 0x1000 99CASED_MASK = 0x2000 100EXTENDED_CASE_MASK = 0x4000 101 102# these ranges need to match unicodedata.c:is_unified_ideograph 103cjk_ranges = [ 104 ('3400', '4DBF'), # CJK Ideograph Extension A CJK 105 ('4E00', '9FFF'), # CJK Ideograph 106 ('20000', '2A6DF'), # CJK Ideograph Extension B 107 ('2A700', '2B739'), # CJK Ideograph Extension C 108 ('2B740', '2B81D'), # CJK Ideograph Extension D 109 ('2B820', '2CEA1'), # CJK Ideograph Extension E 110 ('2CEB0', '2EBE0'), # CJK Ideograph Extension F 111 ('2EBF0', '2EE5D'), # CJK Ideograph Extension I 112 ('30000', '3134A'), # CJK Ideograph Extension G 113 ('31350', '323AF'), # CJK Ideograph Extension H 114] 115 116 117def maketables(trace=0): 118 119 print("--- Reading", UNICODE_DATA % "", "...") 120 121 unicode = UnicodeData(UNIDATA_VERSION) 122 123 print(len(list(filter(None, unicode.table))), "characters") 124 125 for version in old_versions: 126 print("--- Reading", UNICODE_DATA % ("-"+version), "...") 127 old_unicode = UnicodeData(version, cjk_check=False) 128 print(len(list(filter(None, old_unicode.table))), "characters") 129 merge_old_version(version, unicode, old_unicode) 130 131 makeunicodename(unicode, trace) 132 makeunicodedata(unicode, trace) 133 makeunicodetype(unicode, trace) 134 135 136# -------------------------------------------------------------------- 137# unicode character properties 138 139def makeunicodedata(unicode, trace): 140 141 # the default value of east_asian_width is "N", for unassigned code points 142 # not mentioned in EastAsianWidth.txt 143 # in addition there are some reserved but unassigned code points in CJK 144 # ranges that are classified as "W". code points in private use areas 145 # have a width of "A". both of these have entries in 146 # EastAsianWidth.txt 147 # see https://unicode.org/reports/tr11/#Unassigned 148 assert EASTASIANWIDTH_NAMES[0] == "N" 149 dummy = (0, 0, 0, 0, 0, 0) 150 table = [dummy] 151 cache = {0: dummy} 152 index = [0] * len(unicode.chars) 153 154 FILE = "Modules/unicodedata_db.h" 155 156 print("--- Preparing", FILE, "...") 157 158 # 1) database properties 159 160 for char in unicode.chars: 161 record = unicode.table[char] 162 if record: 163 # extract database properties 164 category = CATEGORY_NAMES.index(record.general_category) 165 combining = int(record.canonical_combining_class) 166 bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class) 167 mirrored = record.bidi_mirrored == "Y" 168 eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width) 169 normalizationquickcheck = record.quick_check 170 item = ( 171 category, combining, bidirectional, mirrored, eastasianwidth, 172 normalizationquickcheck 173 ) 174 elif unicode.widths[char] is not None: 175 # an unassigned but reserved character, with a known 176 # east_asian_width 177 eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char]) 178 item = (0, 0, 0, 0, eastasianwidth, 0) 179 else: 180 continue 181 182 # add entry to index and item tables 183 i = cache.get(item) 184 if i is None: 185 cache[item] = i = len(table) 186 table.append(item) 187 index[char] = i 188 189 # 2) decomposition data 190 191 decomp_data_cache = {} 192 decomp_data = [0] 193 decomp_prefix = [""] 194 decomp_index = [0] * len(unicode.chars) 195 decomp_size = 0 196 197 comp_pairs = [] 198 comp_first = [None] * len(unicode.chars) 199 comp_last = [None] * len(unicode.chars) 200 201 for char in unicode.chars: 202 record = unicode.table[char] 203 if record: 204 if record.decomposition_type: 205 decomp = record.decomposition_type.split() 206 if len(decomp) > 19: 207 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char) 208 # prefix 209 if decomp[0][0] == "<": 210 prefix = decomp.pop(0) 211 else: 212 prefix = "" 213 try: 214 i = decomp_prefix.index(prefix) 215 except ValueError: 216 i = len(decomp_prefix) 217 decomp_prefix.append(prefix) 218 prefix = i 219 assert prefix < 256 220 # content 221 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp] 222 # Collect NFC pairs 223 if not prefix and len(decomp) == 3 and \ 224 char not in unicode.exclusions and \ 225 unicode.table[decomp[1]].canonical_combining_class == "0": 226 p, l, r = decomp 227 comp_first[l] = 1 228 comp_last[r] = 1 229 comp_pairs.append((l,r,char)) 230 key = tuple(decomp) 231 i = decomp_data_cache.get(key, -1) 232 if i == -1: 233 i = len(decomp_data) 234 decomp_data.extend(decomp) 235 decomp_size = decomp_size + len(decomp) * 2 236 decomp_data_cache[key] = i 237 else: 238 assert decomp_data[i:i+len(decomp)] == decomp 239 else: 240 i = 0 241 decomp_index[char] = i 242 243 f = l = 0 244 comp_first_ranges = [] 245 comp_last_ranges = [] 246 prev_f = prev_l = None 247 for i in unicode.chars: 248 if comp_first[i] is not None: 249 comp_first[i] = f 250 f += 1 251 if prev_f is None: 252 prev_f = (i,i) 253 elif prev_f[1]+1 == i: 254 prev_f = prev_f[0],i 255 else: 256 comp_first_ranges.append(prev_f) 257 prev_f = (i,i) 258 if comp_last[i] is not None: 259 comp_last[i] = l 260 l += 1 261 if prev_l is None: 262 prev_l = (i,i) 263 elif prev_l[1]+1 == i: 264 prev_l = prev_l[0],i 265 else: 266 comp_last_ranges.append(prev_l) 267 prev_l = (i,i) 268 comp_first_ranges.append(prev_f) 269 comp_last_ranges.append(prev_l) 270 total_first = f 271 total_last = l 272 273 comp_data = [0]*(total_first*total_last) 274 for f,l,char in comp_pairs: 275 f = comp_first[f] 276 l = comp_last[l] 277 comp_data[f*total_last+l] = char 278 279 print(len(table), "unique properties") 280 print(len(decomp_prefix), "unique decomposition prefixes") 281 print(len(decomp_data), "unique decomposition entries:", end=' ') 282 print(decomp_size, "bytes") 283 print(total_first, "first characters in NFC") 284 print(total_last, "last characters in NFC") 285 print(len(comp_pairs), "NFC pairs") 286 287 print("--- Writing", FILE, "...") 288 289 with open(FILE, "w") as fp: 290 fprint = partial(print, file=fp) 291 292 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION)) 293 fprint() 294 fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION) 295 fprint("/* a list of unique database records */") 296 fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {") 297 for item in table: 298 fprint(" {%d, %d, %d, %d, %d, %d}," % item) 299 fprint("};") 300 fprint() 301 302 fprint("/* Reindexing of NFC first characters. */") 303 fprint("#define TOTAL_FIRST",total_first) 304 fprint("#define TOTAL_LAST",total_last) 305 fprint("struct reindex{int start;short count,index;};") 306 fprint("static struct reindex nfc_first[] = {") 307 for start,end in comp_first_ranges: 308 fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start])) 309 fprint(" {0,0,0}") 310 fprint("};\n") 311 fprint("static struct reindex nfc_last[] = {") 312 for start,end in comp_last_ranges: 313 fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start])) 314 fprint(" {0,0,0}") 315 fprint("};\n") 316 317 # FIXME: <fl> the following tables could be made static, and 318 # the support code moved into unicodedatabase.c 319 320 fprint("/* string literals */") 321 fprint("const char *_PyUnicode_CategoryNames[] = {") 322 for name in CATEGORY_NAMES: 323 fprint(" \"%s\"," % name) 324 fprint(" NULL") 325 fprint("};") 326 327 fprint("const char *_PyUnicode_BidirectionalNames[] = {") 328 for name in BIDIRECTIONAL_NAMES: 329 fprint(" \"%s\"," % name) 330 fprint(" NULL") 331 fprint("};") 332 333 fprint("const char *_PyUnicode_EastAsianWidthNames[] = {") 334 for name in EASTASIANWIDTH_NAMES: 335 fprint(" \"%s\"," % name) 336 fprint(" NULL") 337 fprint("};") 338 339 fprint("static const char *decomp_prefix[] = {") 340 for name in decomp_prefix: 341 fprint(" \"%s\"," % name) 342 fprint(" NULL") 343 fprint("};") 344 345 # split record index table 346 index1, index2, shift = splitbins(index, trace) 347 348 fprint("/* index tables for the database records */") 349 fprint("#define SHIFT", shift) 350 Array("index1", index1).dump(fp, trace) 351 Array("index2", index2).dump(fp, trace) 352 353 # split decomposition index table 354 index1, index2, shift = splitbins(decomp_index, trace) 355 356 fprint("/* decomposition data */") 357 Array("decomp_data", decomp_data).dump(fp, trace) 358 359 fprint("/* index tables for the decomposition data */") 360 fprint("#define DECOMP_SHIFT", shift) 361 Array("decomp_index1", index1).dump(fp, trace) 362 Array("decomp_index2", index2).dump(fp, trace) 363 364 index, index2, shift = splitbins(comp_data, trace) 365 fprint("/* NFC pairs */") 366 fprint("#define COMP_SHIFT", shift) 367 Array("comp_index", index).dump(fp, trace) 368 Array("comp_data", index2).dump(fp, trace) 369 370 # Generate delta tables for old versions 371 for version, table, normalization in unicode.changed: 372 cversion = version.replace(".","_") 373 records = [table[0]] 374 cache = {table[0]:0} 375 index = [0] * len(table) 376 for i, record in enumerate(table): 377 try: 378 index[i] = cache[record] 379 except KeyError: 380 index[i] = cache[record] = len(records) 381 records.append(record) 382 index1, index2, shift = splitbins(index, trace) 383 fprint("static const change_record change_records_%s[] = {" % cversion) 384 for record in records: 385 fprint(" { %s }," % ", ".join(map(str,record))) 386 fprint("};") 387 Array("changes_%s_index" % cversion, index1).dump(fp, trace) 388 Array("changes_%s_data" % cversion, index2).dump(fp, trace) 389 fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion) 390 fprint("{") 391 fprint(" int index;") 392 fprint(" if (n >= 0x110000) index = 0;") 393 fprint(" else {") 394 fprint(" index = changes_%s_index[n>>%d];" % (cversion, shift)) 395 fprint(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \ 396 (cversion, shift, ((1<<shift)-1))) 397 fprint(" }") 398 fprint(" return change_records_%s+index;" % cversion) 399 fprint("}\n") 400 fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion) 401 fprint("{") 402 fprint(" switch(n) {") 403 for k, v in normalization: 404 fprint(" case %s: return 0x%s;" % (hex(k), v)) 405 fprint(" default: return 0;") 406 fprint(" }\n}\n") 407 408 409# -------------------------------------------------------------------- 410# unicode character type tables 411 412def makeunicodetype(unicode, trace): 413 414 FILE = "Objects/unicodetype_db.h" 415 416 print("--- Preparing", FILE, "...") 417 418 # extract unicode types 419 dummy = (0, 0, 0, 0, 0, 0) 420 table = [dummy] 421 cache = {dummy: 0} 422 index = [0] * len(unicode.chars) 423 numeric = {} 424 spaces = [] 425 linebreaks = [] 426 extra_casing = [] 427 428 for char in unicode.chars: 429 record = unicode.table[char] 430 if record: 431 # extract database properties 432 category = record.general_category 433 bidirectional = record.bidi_class 434 properties = record.binary_properties 435 flags = 0 436 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]: 437 flags |= ALPHA_MASK 438 if "Lowercase" in properties: 439 flags |= LOWER_MASK 440 if 'Line_Break' in properties or bidirectional == "B": 441 flags |= LINEBREAK_MASK 442 linebreaks.append(char) 443 if category == "Zs" or bidirectional in ("WS", "B", "S"): 444 flags |= SPACE_MASK 445 spaces.append(char) 446 if category == "Lt": 447 flags |= TITLE_MASK 448 if "Uppercase" in properties: 449 flags |= UPPER_MASK 450 if char == ord(" ") or category[0] not in ("C", "Z"): 451 flags |= PRINTABLE_MASK 452 if "XID_Start" in properties: 453 flags |= XID_START_MASK 454 if "XID_Continue" in properties: 455 flags |= XID_CONTINUE_MASK 456 if "Cased" in properties: 457 flags |= CASED_MASK 458 if "Case_Ignorable" in properties: 459 flags |= CASE_IGNORABLE_MASK 460 sc = unicode.special_casing.get(char) 461 cf = unicode.case_folding.get(char, [char]) 462 if record.simple_uppercase_mapping: 463 upper = int(record.simple_uppercase_mapping, 16) 464 else: 465 upper = char 466 if record.simple_lowercase_mapping: 467 lower = int(record.simple_lowercase_mapping, 16) 468 else: 469 lower = char 470 if record.simple_titlecase_mapping: 471 title = int(record.simple_titlecase_mapping, 16) 472 else: 473 title = upper 474 if sc is None and cf != [lower]: 475 sc = ([lower], [title], [upper]) 476 if sc is None: 477 if upper == lower == title: 478 upper = lower = title = 0 479 else: 480 upper = upper - char 481 lower = lower - char 482 title = title - char 483 assert (abs(upper) <= 2147483647 and 484 abs(lower) <= 2147483647 and 485 abs(title) <= 2147483647) 486 else: 487 # This happens either when some character maps to more than one 488 # character in uppercase, lowercase, or titlecase or the 489 # casefolded version of the character is different from the 490 # lowercase. The extra characters are stored in a different 491 # array. 492 flags |= EXTENDED_CASE_MASK 493 lower = len(extra_casing) | (len(sc[0]) << 24) 494 extra_casing.extend(sc[0]) 495 if cf != sc[0]: 496 lower |= len(cf) << 20 497 extra_casing.extend(cf) 498 upper = len(extra_casing) | (len(sc[2]) << 24) 499 extra_casing.extend(sc[2]) 500 # Title is probably equal to upper. 501 if sc[1] == sc[2]: 502 title = upper 503 else: 504 title = len(extra_casing) | (len(sc[1]) << 24) 505 extra_casing.extend(sc[1]) 506 # decimal digit, integer digit 507 decimal = 0 508 if record.decomposition_mapping: 509 flags |= DECIMAL_MASK 510 decimal = int(record.decomposition_mapping) 511 digit = 0 512 if record.numeric_type: 513 flags |= DIGIT_MASK 514 digit = int(record.numeric_type) 515 if record.numeric_value: 516 flags |= NUMERIC_MASK 517 numeric.setdefault(record.numeric_value, []).append(char) 518 item = ( 519 upper, lower, title, decimal, digit, flags 520 ) 521 # add entry to index and item tables 522 i = cache.get(item) 523 if i is None: 524 cache[item] = i = len(table) 525 table.append(item) 526 index[char] = i 527 528 print(len(table), "unique character type entries") 529 print(sum(map(len, numeric.values())), "numeric code points") 530 print(len(spaces), "whitespace code points") 531 print(len(linebreaks), "linebreak code points") 532 print(len(extra_casing), "extended case array") 533 534 print("--- Writing", FILE, "...") 535 536 with open(FILE, "w") as fp: 537 fprint = partial(print, file=fp) 538 539 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION)) 540 fprint() 541 fprint("/* a list of unique character type descriptors */") 542 fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {") 543 for item in table: 544 fprint(" {%d, %d, %d, %d, %d, %d}," % item) 545 fprint("};") 546 fprint() 547 548 fprint("/* extended case mappings */") 549 fprint() 550 fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {") 551 for c in extra_casing: 552 fprint(" %d," % c) 553 fprint("};") 554 fprint() 555 556 # split decomposition index table 557 index1, index2, shift = splitbins(index, trace) 558 559 fprint("/* type indexes */") 560 fprint("#define SHIFT", shift) 561 Array("index1", index1).dump(fp, trace) 562 Array("index2", index2).dump(fp, trace) 563 564 # Generate code for _PyUnicode_ToNumeric() 565 numeric_items = sorted(numeric.items()) 566 fprint('/* Returns the numeric value as double for Unicode characters') 567 fprint(' * having this property, -1.0 otherwise.') 568 fprint(' */') 569 fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)') 570 fprint('{') 571 fprint(' switch (ch) {') 572 for value, codepoints in numeric_items: 573 # Turn text into float literals 574 parts = value.split('/') 575 parts = [repr(float(part)) for part in parts] 576 value = '/'.join(parts) 577 578 codepoints.sort() 579 for codepoint in codepoints: 580 fprint(' case 0x%04X:' % (codepoint,)) 581 fprint(' return (double) %s;' % (value,)) 582 fprint(' }') 583 fprint(' return -1.0;') 584 fprint('}') 585 fprint() 586 587 # Generate code for _PyUnicode_IsWhitespace() 588 fprint("/* Returns 1 for Unicode characters having the bidirectional") 589 fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.") 590 fprint(" */") 591 fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)') 592 fprint('{') 593 fprint(' switch (ch) {') 594 595 for codepoint in sorted(spaces): 596 fprint(' case 0x%04X:' % (codepoint,)) 597 fprint(' return 1;') 598 599 fprint(' }') 600 fprint(' return 0;') 601 fprint('}') 602 fprint() 603 604 # Generate code for _PyUnicode_IsLinebreak() 605 fprint("/* Returns 1 for Unicode characters having the line break") 606 fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional") 607 fprint(" * type 'B', 0 otherwise.") 608 fprint(" */") 609 fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)') 610 fprint('{') 611 fprint(' switch (ch) {') 612 for codepoint in sorted(linebreaks): 613 fprint(' case 0x%04X:' % (codepoint,)) 614 fprint(' return 1;') 615 616 fprint(' }') 617 fprint(' return 0;') 618 fprint('}') 619 fprint() 620 621 622# -------------------------------------------------------------------- 623# unicode name database 624 625def makeunicodename(unicode, trace): 626 from dawg import build_compression_dawg 627 628 FILE = "Modules/unicodename_db.h" 629 630 print("--- Preparing", FILE, "...") 631 632 # unicode name hash table 633 634 # extract names 635 data = [] 636 for char in unicode.chars: 637 record = unicode.table[char] 638 if record: 639 name = record.name.strip() 640 if name and name[0] != "<": 641 data.append((name, char)) 642 643 print("--- Writing", FILE, "...") 644 645 with open(FILE, "w") as fp: 646 fprint = partial(print, file=fp) 647 648 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION)) 649 fprint() 650 fprint("#define NAME_MAXLEN", 256) 651 assert max(len(x) for x in data) < 256 652 fprint() 653 654 fprint("/* name->code dictionary */") 655 packed_dawg, pos_to_codepoint = build_compression_dawg(data) 656 notfound = len(pos_to_codepoint) 657 inverse_list = [notfound] * len(unicode.chars) 658 for pos, codepoint in enumerate(pos_to_codepoint): 659 inverse_list[codepoint] = pos 660 Array("packed_name_dawg", list(packed_dawg)).dump(fp, trace) 661 Array("dawg_pos_to_codepoint", pos_to_codepoint).dump(fp, trace) 662 index1, index2, shift = splitbins(inverse_list, trace) 663 fprint("#define DAWG_CODEPOINT_TO_POS_SHIFT", shift) 664 fprint("#define DAWG_CODEPOINT_TO_POS_NOTFOUND", notfound) 665 Array("dawg_codepoint_to_pos_index1", index1).dump(fp, trace) 666 Array("dawg_codepoint_to_pos_index2", index2).dump(fp, trace) 667 668 fprint() 669 fprint('static const unsigned int aliases_start = %#x;' % 670 NAME_ALIASES_START) 671 fprint('static const unsigned int aliases_end = %#x;' % 672 (NAME_ALIASES_START + len(unicode.aliases))) 673 674 fprint('static const unsigned int name_aliases[] = {') 675 for name, codepoint in unicode.aliases: 676 fprint(' 0x%04X,' % codepoint) 677 fprint('};') 678 679 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars, 680 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer 681 # sequences or sequences with non-BMP chars are added. 682 # unicodedata_lookup should be adapted too. 683 fprint(dedent(""" 684 typedef struct NamedSequence { 685 int seqlen; 686 Py_UCS2 seq[4]; 687 } named_sequence; 688 """)) 689 690 fprint('static const unsigned int named_sequences_start = %#x;' % 691 NAMED_SEQUENCES_START) 692 fprint('static const unsigned int named_sequences_end = %#x;' % 693 (NAMED_SEQUENCES_START + len(unicode.named_sequences))) 694 695 fprint('static const named_sequence named_sequences[] = {') 696 for name, sequence in unicode.named_sequences: 697 seq_str = ', '.join('0x%04X' % cp for cp in sequence) 698 fprint(' {%d, {%s}},' % (len(sequence), seq_str)) 699 fprint('};') 700 701 702def merge_old_version(version, new, old): 703 # Changes to exclusion file not implemented yet 704 if old.exclusions != new.exclusions: 705 raise NotImplementedError("exclusions differ") 706 707 # In these change records, 0xFF means "no change" 708 bidir_changes = [0xFF]*0x110000 709 category_changes = [0xFF]*0x110000 710 decimal_changes = [0xFF]*0x110000 711 mirrored_changes = [0xFF]*0x110000 712 east_asian_width_changes = [0xFF]*0x110000 713 # In numeric data, 0 means "no change", 714 # -1 means "did not have a numeric value 715 numeric_changes = [0] * 0x110000 716 # normalization_changes is a list of key-value pairs 717 normalization_changes = [] 718 for i in range(0x110000): 719 if new.table[i] is None: 720 # Characters unassigned in the new version ought to 721 # be unassigned in the old one 722 assert old.table[i] is None 723 continue 724 # check characters unassigned in the old version 725 if old.table[i] is None: 726 # category 0 is "unassigned" 727 category_changes[i] = 0 728 continue 729 # check characters that differ 730 if old.table[i] != new.table[i]: 731 for k, field in enumerate(dataclasses.fields(UcdRecord)): 732 value = getattr(old.table[i], field.name) 733 new_value = getattr(new.table[i], field.name) 734 if value != new_value: 735 if k == 1 and i in PUA_15: 736 # the name is not set in the old.table, but in the 737 # new.table we are using it for aliases and named seq 738 assert value == '' 739 elif k == 2: 740 category_changes[i] = CATEGORY_NAMES.index(value) 741 elif k == 4: 742 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value) 743 elif k == 5: 744 # We assume that all normalization changes are in 1:1 mappings 745 assert " " not in value 746 normalization_changes.append((i, value)) 747 elif k == 6: 748 # we only support changes where the old value is a single digit 749 assert value in "0123456789" 750 decimal_changes[i] = int(value) 751 elif k == 8: 752 # Since 0 encodes "no change", the old value is better not 0 753 if not value: 754 numeric_changes[i] = -1 755 else: 756 numeric_changes[i] = float(value) 757 assert numeric_changes[i] not in (0, -1) 758 elif k == 9: 759 if value == 'Y': 760 mirrored_changes[i] = '1' 761 else: 762 mirrored_changes[i] = '0' 763 elif k == 11: 764 # change to ISO comment, ignore 765 pass 766 elif k == 12: 767 # change to simple uppercase mapping; ignore 768 pass 769 elif k == 13: 770 # change to simple lowercase mapping; ignore 771 pass 772 elif k == 14: 773 # change to simple titlecase mapping; ignore 774 pass 775 elif k == 15: 776 # change to east asian width 777 east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value) 778 elif k == 16: 779 # derived property changes; not yet 780 pass 781 elif k == 17: 782 # normalization quickchecks are not performed 783 # for older versions 784 pass 785 else: 786 class Difference(Exception):pass 787 raise Difference(hex(i), k, old.table[i], new.table[i]) 788 new.changed.append((version, list(zip(bidir_changes, category_changes, 789 decimal_changes, mirrored_changes, 790 east_asian_width_changes, 791 numeric_changes)), 792 normalization_changes)) 793 794 795DATA_DIR = os.path.join('Tools', 'unicode', 'data') 796 797def open_data(template, version): 798 local = os.path.join(DATA_DIR, template % ('-'+version,)) 799 if not os.path.exists(local): 800 import urllib.request 801 if version == '3.2.0': 802 # irregular url structure 803 url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,) 804 else: 805 url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '') 806 os.makedirs(DATA_DIR, exist_ok=True) 807 urllib.request.urlretrieve(url, filename=local) 808 if local.endswith('.txt'): 809 return open(local, encoding='utf-8') 810 else: 811 # Unihan.zip 812 return open(local, 'rb') 813 814 815def expand_range(char_range: str) -> Iterator[int]: 816 ''' 817 Parses ranges of code points, as described in UAX #44: 818 https://www.unicode.org/reports/tr44/#Code_Point_Ranges 819 ''' 820 if '..' in char_range: 821 first, last = [int(c, 16) for c in char_range.split('..')] 822 else: 823 first = last = int(char_range, 16) 824 for char in range(first, last+1): 825 yield char 826 827 828class UcdFile: 829 ''' 830 A file in the standard format of the UCD. 831 832 See: https://www.unicode.org/reports/tr44/#Format_Conventions 833 834 Note that, as described there, the Unihan data files have their 835 own separate format. 836 ''' 837 838 def __init__(self, template: str, version: str) -> None: 839 self.template = template 840 self.version = version 841 842 def records(self) -> Iterator[List[str]]: 843 with open_data(self.template, self.version) as file: 844 for line in file: 845 line = line.split('#', 1)[0].strip() 846 if not line: 847 continue 848 yield [field.strip() for field in line.split(';')] 849 850 def __iter__(self) -> Iterator[List[str]]: 851 return self.records() 852 853 def expanded(self) -> Iterator[Tuple[int, List[str]]]: 854 for record in self.records(): 855 char_range, rest = record[0], record[1:] 856 for char in expand_range(char_range): 857 yield char, rest 858 859 860@dataclasses.dataclass 861class UcdRecord: 862 # 15 fields from UnicodeData.txt . See: 863 # https://www.unicode.org/reports/tr44/#UnicodeData.txt 864 codepoint: str 865 name: str 866 general_category: str 867 canonical_combining_class: str 868 bidi_class: str 869 decomposition_type: str 870 decomposition_mapping: str 871 numeric_type: str 872 numeric_value: str 873 bidi_mirrored: str 874 unicode_1_name: str # obsolete 875 iso_comment: str # obsolete 876 simple_uppercase_mapping: str 877 simple_lowercase_mapping: str 878 simple_titlecase_mapping: str 879 880 # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt 881 east_asian_width: Optional[str] 882 883 # Binary properties, as a set of those that are true. 884 # Taken from multiple files: 885 # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt 886 # https://www.unicode.org/reports/tr44/#LineBreak.txt 887 binary_properties: Set[str] 888 889 # The Quick_Check properties related to normalization: 890 # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization 891 # We store them as a bitmask. 892 quick_check: int 893 894 895def from_row(row: List[str]) -> UcdRecord: 896 return UcdRecord(*row, None, set(), 0) 897 898 899# -------------------------------------------------------------------- 900# the following support code is taken from the unidb utilities 901# Copyright (c) 1999-2000 by Secret Labs AB 902 903# load a unicode-data file from disk 904 905class UnicodeData: 906 # table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned 907 908 def __init__(self, version, cjk_check=True): 909 self.changed = [] 910 table = [None] * 0x110000 911 for s in UcdFile(UNICODE_DATA, version): 912 char = int(s[0], 16) 913 table[char] = from_row(s) 914 915 cjk_ranges_found = [] 916 917 # expand first-last ranges 918 field = None 919 for i in range(0, 0x110000): 920 # The file UnicodeData.txt has its own distinct way of 921 # expressing ranges. See: 922 # https://www.unicode.org/reports/tr44/#Code_Point_Ranges 923 s = table[i] 924 if s: 925 if s.name[-6:] == "First>": 926 s.name = "" 927 field = dataclasses.astuple(s)[:15] 928 elif s.name[-5:] == "Last>": 929 if s.name.startswith("<CJK Ideograph"): 930 cjk_ranges_found.append((field[0], 931 s.codepoint)) 932 s.name = "" 933 field = None 934 elif field: 935 table[i] = from_row(('%X' % i,) + field[1:]) 936 if cjk_check and cjk_ranges != cjk_ranges_found: 937 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found) 938 939 # public attributes 940 self.filename = UNICODE_DATA % '' 941 self.table = table 942 self.chars = list(range(0x110000)) # unicode 3.2 943 944 # check for name aliases and named sequences, see #12753 945 # aliases and named sequences are not in 3.2.0 946 if version != '3.2.0': 947 self.aliases = [] 948 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF, 949 # in order to take advantage of the compression and lookup 950 # algorithms used for the other characters 951 pua_index = NAME_ALIASES_START 952 for char, name, abbrev in UcdFile(NAME_ALIASES, version): 953 char = int(char, 16) 954 self.aliases.append((name, char)) 955 # also store the name in the PUA 1 956 self.table[pua_index].name = name 957 pua_index += 1 958 assert pua_index - NAME_ALIASES_START == len(self.aliases) 959 960 self.named_sequences = [] 961 # store named sequences in the PUA 1, in range U+F0100.., 962 # in order to take advantage of the compression and lookup 963 # algorithms used for the other characters. 964 965 assert pua_index < NAMED_SEQUENCES_START 966 pua_index = NAMED_SEQUENCES_START 967 for name, chars in UcdFile(NAMED_SEQUENCES, version): 968 chars = tuple(int(char, 16) for char in chars.split()) 969 # check that the structure defined in makeunicodename is OK 970 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size" 971 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in " 972 "the NamedSequence struct and in unicodedata_lookup") 973 self.named_sequences.append((name, chars)) 974 # also store these in the PUA 1 975 self.table[pua_index].name = name 976 pua_index += 1 977 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences) 978 979 self.exclusions = {} 980 for char, in UcdFile(COMPOSITION_EXCLUSIONS, version): 981 char = int(char, 16) 982 self.exclusions[char] = 1 983 984 widths = [None] * 0x110000 985 for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded(): 986 widths[char] = width 987 988 for i in range(0, 0x110000): 989 if table[i] is not None: 990 table[i].east_asian_width = widths[i] 991 self.widths = widths 992 993 for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded(): 994 if propinfo: 995 # this is not a binary property, ignore it 996 continue 997 998 if table[char]: 999 # Some properties (e.g. Default_Ignorable_Code_Point) 1000 # apply to unassigned code points; ignore them 1001 table[char].binary_properties.add(propname) 1002 1003 for char_range, value in UcdFile(LINE_BREAK, version): 1004 if value not in MANDATORY_LINE_BREAKS: 1005 continue 1006 for char in expand_range(char_range): 1007 table[char].binary_properties.add('Line_Break') 1008 1009 # We only want the quickcheck properties 1010 # Format: NF?_QC; Y(es)/N(o)/M(aybe) 1011 # Yes is the default, hence only N and M occur 1012 # In 3.2.0, the format was different (NF?_NO) 1013 # The parsing will incorrectly determine these as 1014 # "yes", however, unicodedata.c will not perform quickchecks 1015 # for older versions, and no delta records will be created. 1016 quickchecks = [0] * 0x110000 1017 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() 1018 for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version): 1019 if len(s) < 2 or s[1] not in qc_order: 1020 continue 1021 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No 1022 quickcheck_shift = qc_order.index(s[1])*2 1023 quickcheck <<= quickcheck_shift 1024 for char in expand_range(s[0]): 1025 assert not (quickchecks[char]>>quickcheck_shift)&3 1026 quickchecks[char] |= quickcheck 1027 for i in range(0, 0x110000): 1028 if table[i] is not None: 1029 table[i].quick_check = quickchecks[i] 1030 1031 with open_data(UNIHAN, version) as file: 1032 zip = zipfile.ZipFile(file) 1033 if version == '3.2.0': 1034 data = zip.open('Unihan-3.2.0.txt').read() 1035 else: 1036 data = zip.open('Unihan_NumericValues.txt').read() 1037 for line in data.decode("utf-8").splitlines(): 1038 if not line.startswith('U+'): 1039 continue 1040 code, tag, value = line.split(None, 3)[:3] 1041 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric', 1042 'kOtherNumeric'): 1043 continue 1044 value = value.strip().replace(',', '') 1045 i = int(code[2:], 16) 1046 # Patch the numeric field 1047 if table[i] is not None: 1048 table[i].numeric_value = value 1049 1050 sc = self.special_casing = {} 1051 for data in UcdFile(SPECIAL_CASING, version): 1052 if data[4]: 1053 # We ignore all conditionals (since they depend on 1054 # languages) except for one, which is hardcoded. See 1055 # handle_capital_sigma in unicodeobject.c. 1056 continue 1057 c = int(data[0], 16) 1058 lower = [int(char, 16) for char in data[1].split()] 1059 title = [int(char, 16) for char in data[2].split()] 1060 upper = [int(char, 16) for char in data[3].split()] 1061 sc[c] = (lower, title, upper) 1062 1063 cf = self.case_folding = {} 1064 if version != '3.2.0': 1065 for data in UcdFile(CASE_FOLDING, version): 1066 if data[1] in "CF": 1067 c = int(data[0], 16) 1068 cf[c] = [int(char, 16) for char in data[2].split()] 1069 1070 def uselatin1(self): 1071 # restrict character range to ISO Latin 1 1072 self.chars = list(range(256)) 1073 1074 1075 1076# stuff to deal with arrays of unsigned integers 1077 1078class Array: 1079 1080 def __init__(self, name, data): 1081 self.name = name 1082 self.data = data 1083 1084 def dump(self, file, trace=0): 1085 # write data to file, as a C array 1086 size = getsize(self.data) 1087 if trace: 1088 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr) 1089 file.write("static const ") 1090 if size == 1: 1091 file.write("unsigned char") 1092 elif size == 2: 1093 file.write("unsigned short") 1094 else: 1095 file.write("unsigned int") 1096 file.write(" " + self.name + "[] = {\n") 1097 if self.data: 1098 s = " " 1099 for item in self.data: 1100 i = str(item) + ", " 1101 if len(s) + len(i) > 78: 1102 file.write(s.rstrip() + "\n") 1103 s = " " + i 1104 else: 1105 s = s + i 1106 if s.strip(): 1107 file.write(s.rstrip() + "\n") 1108 file.write("};\n\n") 1109 1110 1111def getsize(data): 1112 # return smallest possible integer size for the given array 1113 maxdata = max(data) 1114 if maxdata < 256: 1115 return 1 1116 elif maxdata < 65536: 1117 return 2 1118 else: 1119 return 4 1120 1121 1122def splitbins(t, trace=0): 1123 """t, trace=0 -> (t1, t2, shift). Split a table to save space. 1124 1125 t is a sequence of ints. This function can be useful to save space if 1126 many of the ints are the same. t1 and t2 are lists of ints, and shift 1127 is an int, chosen to minimize the combined size of t1 and t2 (in C 1128 code), and where for each i in range(len(t)), 1129 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 1130 where mask is a bitmask isolating the last "shift" bits. 1131 1132 If optional arg trace is non-zero (default zero), progress info 1133 is printed to sys.stderr. The higher the value, the more info 1134 you'll get. 1135 """ 1136 1137 if trace: 1138 def dump(t1, t2, shift, bytes): 1139 print("%d+%d bins at shift %d; %d bytes" % ( 1140 len(t1), len(t2), shift, bytes), file=sys.stderr) 1141 print("Size of original table:", len(t)*getsize(t), "bytes", 1142 file=sys.stderr) 1143 n = len(t)-1 # last valid index 1144 maxshift = 0 # the most we can shift n and still have something left 1145 if n > 0: 1146 while n >> 1: 1147 n >>= 1 1148 maxshift += 1 1149 del n 1150 bytes = sys.maxsize # smallest total size so far 1151 t = tuple(t) # so slices can be dict keys 1152 for shift in range(maxshift + 1): 1153 t1 = [] 1154 t2 = [] 1155 size = 2**shift 1156 bincache = {} 1157 for i in range(0, len(t), size): 1158 bin = t[i:i+size] 1159 index = bincache.get(bin) 1160 if index is None: 1161 index = len(t2) 1162 bincache[bin] = index 1163 t2.extend(bin) 1164 t1.append(index >> shift) 1165 # determine memory size 1166 b = len(t1)*getsize(t1) + len(t2)*getsize(t2) 1167 if trace > 1: 1168 dump(t1, t2, shift, b) 1169 if b < bytes: 1170 best = t1, t2, shift 1171 bytes = b 1172 t1, t2, shift = best 1173 if trace: 1174 print("Best:", end=' ', file=sys.stderr) 1175 dump(t1, t2, shift, bytes) 1176 if __debug__: 1177 # exhaustively verify that the decomposition is correct 1178 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits 1179 for i in range(len(t)): 1180 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 1181 return best 1182 1183 1184if __name__ == "__main__": 1185 maketables(1) 1186