#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#  * Doc/library/stdtypes.rst,
#  * Doc/library/unicodedata.rst, and
#  * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "13.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFC'),
    ('20000', '2A6DD'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
]


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

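    # For reference only (this script does not read the data back): each
    # decomposition entry written above starts with a header word whose low
    # byte is an index into decomp_prefix and whose high bits give the number
    # of code points that follow; the code points themselves come next.  The
    # authoritative reader of this layout is Modules/unicodedata.c.
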
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than
                # one character in uppercase, lowercase, or titlecase, or when
                # the casefolded version of the character differs from its
                # lowercase form.  The extra characters are stored in a
                # different array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split type index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split phrasebook offset table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')
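

# Illustrative only (not used by this script): a sketch of how a name can be
# reassembled from the phrasebook/lexicon encoding built in makeunicodename()
# above.  Word indexes below `short` occupy one phrasebook byte, larger ones
# two bytes offset by `short`; in the lexicon the last character of each word
# has bit 7 set, and a trailing 0x80 byte marks the end of the whole name.
# The equivalent walk is done in C by Modules/unicodedata.c.
def _decode_name(phrasebook, phrasebook_offset, lexicon, lexicon_offset,
                 short, char):
    pos = phrasebook_offset[char]
    if not pos:
        return None                      # no name stored for this code point
    parts = []
    while True:
        code = phrasebook[pos]
        pos += 1
        if code >= short:                # two-byte escape
            code = ((code - short) << 8) + phrasebook[pos]
            pos += 1
        start = lexicon_offset[code]
        word = []
        while lexicon[start] < 128:
            word.append(chr(lexicon[start]))
            start += 1
        last = lexicon[start] - 128
        if last:                         # last letter of a non-final word
            word.append(chr(last))
            parts.append("".join(word))
        else:                            # 0x80 terminator: end of the name
            parts.append("".join(word))
            return " ".join(parts)

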
def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named
                        # sequences
                        assert value == ''
                    elif k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value had better not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))


DATA_DIR = os.path.join('Tools', 'unicode', 'data')

def open_data(template, version):
    local = os.path.join(DATA_DIR, template % ('-'+version,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')


def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    for char in range(first, last+1):
        yield char


class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        with open_data(self.template, self.version) as file:
            for line in file:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                yield [field.strip() for field in line.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        for record in self.records():
            char_range, rest = record[0], record[1:]
            for char in expand_range(char_range):
                yield char, rest


@dataclasses.dataclass
class UcdRecord:
    # 15 fields from UnicodeData.txt.  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    return UcdRecord(*row, None, set(), 0)


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000))  # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in PUA 15, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
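        # The resulting bitmask uses two bits per property, in the order
        # NFD_QC, NFKD_QC, NFC_QC, NFKC_QC (lowest bits first), with
        # 0 meaning "Yes", 1 "Maybe" and 2 "No" -- see the loop below.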
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1  # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded.  See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))


# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
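
# Illustrative only (not used by this script): the probe sequence a reader of
# the generated table would follow for a key, mirroring the collision handling
# in Hash.__init__ below.  Since only values (code points) are stored, the
# consumer in Modules/unicodedata.c verifies each candidate slot by
# regenerating the name for the stored code point and comparing it.
def _hash_probe_slots(key, magic, size, poly):
    mask = size - 1
    h = myhash(key, magic)
    i = (~h) & mask
    yield i                          # first candidate slot
    incr = (h ^ (h >> 3)) & mask
    if not incr:
        incr = mask
    while True:
        i = (i + incr) & mask
        yield i                      # next candidate on collision
        incr = incr << 1
        if incr > mask:
            incr = incr ^ poly
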

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h


SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]


class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")


def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

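
# Illustrative only (not used by this script): how the two tables returned by
# splitbins() below are consumed.  The generated C headers do the same
# arithmetic with the emitted SHIFT constant, e.g.
#   index2[(index1[ch >> SHIFT] << SHIFT) + (ch & ((1 << SHIFT) - 1))]
def _splitbins_lookup(t1, t2, shift, i):
    mask = (1 << shift) - 1
    return t2[(t1[i >> shift] << shift) + (i & mask)]
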

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


if __name__ == "__main__":
    maketables(1)