#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent
from functools import partial

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst,
# * Doc/library/unicodedata.rst, and
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in the BMP and in planes 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
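        # (Each entry of unicode.changed holds one change_record per code
        # point; the records are deduplicated and indexed with the same
        # two-level splitbins() layout used above, and a get_change_X()
        # accessor plus a normalization_X() switch -- covering only the 1:1
        # decomposition changes collected by merge_old_version() -- are
        # emitted as C code for each old version.)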
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split the type index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
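                    # (a lexicon index that does not fit in a single byte is
                    # written as an escape byte in [short, 256) carrying the
                    # high bits, followed by a byte with the low eight bits)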
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split the phrasebook offset table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
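        # For example, a two-code-point sequence such as U+0100 U+0300 would
        # be emitted below as "{2, {0x0100, 0x0300}}".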
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value must not be 0 here
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))


def open_data(template, version):
    local = template % ('-'+version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 15, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.
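            # (the C lookup code is expected to translate a hash hit in this
            # PUA range back into the stored sequence, via the
            # named_sequences_start/end constants written by makeunicodename)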

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))


# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
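#
# The name->code lookup in unicodedata.c is assumed to replay the same probe
# sequence as the insertion loop in Hash.__init__ below: start at (~h) & mask,
# step by (h ^ (h >> 3)) & mask, and double the step modulo the table's
# polynomial whenever it overflows the mask.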

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h


SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]


class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")


def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.
    t1 and t2 are lists of ints, and shift is an int, chosen to minimize the
    combined size of t1 and t2 (in C code), and where for each i in
    range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)    # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


if __name__ == "__main__":
    maketables(1)