#!/usr/bin/env python3
# flake8: noqa: F821

"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""

import sys

if len(sys.argv) != 10:
    sys.exit(__doc__)

# Code points whose Scripts.txt value is listed here are dropped from the table.
DISABLED_SCRIPTS = {
    'Arabic',
    'Lao',
    'Samaritan',
    'Syriac',
    'Thai',
}

# File indices follow the command-line order given in the usage string:
#   0 IndicSyllabicCategory    1 IndicPositionalCategory   2 ArabicShaping
#   3 DerivedCoreProperties    4 UnicodeData               5 Blocks
#   6 Scripts                  7 IndicSyllabicCategory-Additional
#   8 IndicPositionalCategory-Additional
files = [open(x, encoding='utf-8') for x in sys.argv[1:]]
try:
    # The first two lines of every file except UnicodeData.txt (index 4) are
    # kept as its header; skipping index 4 shifts later header slots down by one.
    headers = [[f.readline() for _ in range(2)] for j, f in enumerate(files) if j != 4]
    # For the two "-Additional" files the header continues up to the first
    # blank line.
    for j in range(7, 9):
        for line in files[j]:
            line = line.rstrip()
            if not line:
                break
            headers[j - 1].append(line)
    headers.append(["UnicodeData.txt does not have a header."])

    # data[i][codepoint] -> property value; values[i][value] -> code point count.
    data = [{} for _ in files]
    values = [{} for _ in files]
    for i, f in enumerate(files):
        for line in f:

            # Strip trailing comments.
            j = line.find('#')
            if j >= 0:
                line = line[:j]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                continue

            # First field is either a single code point or a "start..end" range.
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            if len(uu) == 1:
                end = start
            else:
                end = int(uu[1], 16)

            # ArabicShaping and UnicodeData carry the value in the third field;
            # every other file carries it in the second.
            t = fields[1 if i not in [2, 4] else 2]

            if i == 2:
                t = 'jt_' + t
            elif i == 3 and t != 'Default_Ignorable_Code_Point':
                continue
            elif i == 7 and t == 'Consonant_Final_Modifier':
                # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
                t = 'Syllable_Modifier'
            elif i == 8 and t == 'NA':
                t = 'Not_Applicable'

            # The "-Additional" files (7, 8) are folded into slots 0 and 1,
            # overriding whatever the Unicode files assigned.
            i0 = i if i < 7 else i - 7
            for u in range(start, end + 1):
                data[i0][u] = t
            values[i0][t] = values[i0].get(t, 0) + end - start + 1
finally:
    # Everything has been read into memory; release the file handles even if
    # parsing raised.
    for f in files:
        f.close()

# Per-slot default used for code points a given file does not mention.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# TODO Characters that are not in Unicode Indic files, but used in USE
data[0][0x1B61] = defaults[0]
data[0][0x1B63] = defaults[0]
data[0][0x1B64] = defaults[0]
data[0][0x1B65] = defaults[0]
data[0][0x1B66] = defaults[0]
data[0][0x1B67] = defaults[0]
data[0][0x1B69] = defaults[0]
data[0][0x1B6A] = defaults[0]
data[0][0x2060] = defaults[0]
# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
data[0][0x1B5B] = 'Consonant_Placeholder'
data[0][0x1B5C] = 'Consonant_Placeholder'
data[0][0x1B5F] = 'Consonant_Placeholder'
data[0][0x1B62] = 'Consonant_Placeholder'
data[0][0x1B68] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
data[0][0x11C44] = 'Consonant_Placeholder'
data[0][0x11C45] = 'Consonant_Placeholder'
# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
data[0][0x111C8] = 'Consonant_Placeholder'

# Merge data into one dict:
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1
combined = {}
for i, d in enumerate(data):
    for u, v in d.items():
        if u not in combined:
            # Only the first four sources may introduce a code point;
            # UnicodeData, Blocks and Scripts merely annotate existing ones.
            if i >= 4:
                continue
            combined[u] = list(defaults)
        combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
data = combined
del combined


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Consonant_Initial_Postfixed',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    'Hieroglyph',
    'Hieroglyph_Joiner',
    'Hieroglyph_Segment_Begin',
    'Hieroglyph_Segment_End',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Bottom_And_Left',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
    # Joining_Type
    'jt_C',
    'jt_D',
    'jt_L',
    'jt_R',
    'jt_T',
    'jt_U',
    'jt_X',
]


class PropertyValue(object):
    """Named property value that compares equal to its own name.

    Equality accepts either a plain string or another object exposing
    ``name``, so parsed string values can be tested against these constants.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        return self.name == (other if isinstance(other, str) else other.name)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Hash like the bare name so lookups in str-keyed dicts work.
        return hash(str(self))


property_values = {}

for name in property_names:
    value = PropertyValue(name)
    assert value not in property_values
    assert value not in globals()
    property_values[name] = value
# Expose every property value as a module-level name (Lo, Bindu, jt_C, ...);
# this is why flake8's undefined-name check is disabled for this file.
globals().update(property_values)


# Every predicate below takes the same five arguments:
#   U    -- the code point (int)
#   UISC -- its Indic_Syllabic_Category value
#   UDI  -- its Default_Ignorable_Code_Point value ('' when not set)
#   UGC  -- its General_Category value
#   AJT  -- its Joining_Type value (prefixed with 'jt_')
# and reports whether the code point belongs to the USE category that
# `use_mapping` associates with the predicate.

def is_BASE(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'B'."""
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
            Tone_Letter,
            Vowel_Independent,
            ] or
        # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
        AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
        (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
            Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'N'."""
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'GB'."""
    if UISC == Consonant_Placeholder: return True
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'CGJ'."""
    # Also includes VARIATION_SELECTOR, WJ, and ZWJ
    return U == 0x200D or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'F'."""
    return ((UISC == Consonant_Final and UGC != Lo) or
        UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'FM'."""
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'M'."""
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return (UISC == Consonant_Medial and UGC != Lo or
        UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'CM'."""
    return (UISC in [Nukta, Gemination_Mark, Consonant_Killer] and
        not is_SYM_MOD(U, UISC, UDI, UGC, AJT))
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SUB'."""
    return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'CS'."""
    return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'H'."""
    return (UISC in [Virama, Invisible_Stacker]
        and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
        and not is_SAKOT(U, UISC, UDI, UGC, AJT))
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'HVM'."""
    # Split off of HALANT
    # https://github.com/harfbuzz/harfbuzz/issues/1379
    return U == 0x1134D
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'HN'."""
    return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'G'."""
    return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'J'."""
    return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SB'."""
    return UISC == Hieroglyph_Segment_Begin
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SE'."""
    return UISC == Hieroglyph_Segment_End
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'ZWNJ'."""
    return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'O'."""
    # Also includes BASE_IND, Rsv, and SYM
    return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
        and not is_BASE(U, UISC, UDI, UGC, AJT)
        and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
        and not is_CGJ(U, UISC, UDI, UGC, AJT)
        and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
        )
def is_REPHA(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'R'."""
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'Sk'."""
    # Split off of HALANT
    return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SM'."""
    return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'V'."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC == Pure_Killer or
        (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'VM'."""
    # https://github.com/harfbuzz/harfbuzz/issues/376
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
        (UGC != Lo and (UISC == Bindu or U in [0xAA29])))

# USE category tag -> predicate deciding membership in that category.
use_mapping = {
    'B':    is_BASE,
    'N':    is_BASE_NUM,
    'GB':   is_BASE_OTHER,
    'CGJ':  is_CGJ,
    'F':    is_CONS_FINAL,
    'FM':   is_CONS_FINAL_MOD,
    'M':    is_CONS_MED,
    'CM':   is_CONS_MOD,
    'SUB':  is_CONS_SUB,
    'CS':   is_CONS_WITH_STACKER,
    'H':    is_HALANT,
    'HVM':  is_HALANT_OR_VOWEL_MODIFIER,
    'HN':   is_HALANT_NUM,
    'G':    is_HIEROGLYPH,
    'J':    is_HIEROGLYPH_JOINER,
    'SB':   is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE':   is_HIEROGLYPH_SEGMENT_END,
    'ZWNJ': is_ZWNJ,
    'O':    is_OTHER,
    'R':    is_REPHA,
    'Sk':   is_SAKOT,
    'SM':   is_SYM_MOD,
    'V':    is_VOWEL,
    'VM':   is_VOWEL_MOD,
}

# USE category tag -> {positional suffix: Indic_Positional_Category values}.
# A None entry means the category may carry a positional category but gets no
# positional suffix appended to its tag.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HVM': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'R': None,
    'SUB': None,
}

def map_to_use(data):
    """Map each code point in *data* to its USE tag and Unicode block.

    *data* maps a code point to a 7-item sequence
    (UISC, UIPC, AJT, UDI, UGC, UBlock, script).  The result maps the same
    code points to ``(USE, UBlock)``, where USE is the single key of
    `use_mapping` whose predicate accepts the code point, extended with a
    positional suffix when `use_positions` defines one for that tag.

    Raises AssertionError when a code point matches zero or several
    categories, or carries a positional category its tag cannot express.
    """
    classifiers = use_mapping.items()
    result = {}
    for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

        # Resolve Indic_Syllabic_Category (the overridden ranges are disjoint)

        if 0x1CE2 <= U <= 0x1CE8:
            # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
            UISC = Cantillation_Mark
        elif 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
            # Tibetan:
            # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
            UISC = Vowel_Dependent
        elif 0x1BF2 <= U <= 0x1BF3:
            # TODO: https://github.com/harfbuzz/harfbuzz/pull/627
            UISC = Nukta
            UIPC = Bottom
        elif U == 0x1CED:
            # TODO: U+1CED should only be allowed after some of
            # the nasalization marks, maybe only for U+1CE9..U+1CF1.
            UISC = Tone_Mark
        elif U == 0xA982:
            # TODO: https://github.com/microsoft/font-tools/issues/1
            UISC = Consonant_Succeeding_Repha

        matches = [tag for tag, accepts in classifiers
                   if accepts(U, UISC, UDI, UGC, AJT)]
        assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        if U in [0x953, 0x954]:
            # TODO: These should die, but have UIPC in Unicode 13.0.0
            UIPC = Not_Applicable
        elif (
            # TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
            0xA926 <= U <= 0xA92A
            # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
            #  and https://github.com/harfbuzz/harfbuzz/issues/1631
            or U in [0x11302, 0x11303, 0x114C1]
            or 0x1CF8 <= U <= 0x1CF9
            # TODO: https://github.com/harfbuzz/harfbuzz/pull/982
            # also  https://github.com/harfbuzz/harfbuzz/issues/1012
            or 0x1112A <= U <= 0x1112B
            or 0x11131 <= U <= 0x11132
        ):
            UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
                USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

        suffix_table = use_positions.get(USE, None)
        if suffix_table:
            suffixes = [suffix for suffix, positions in suffix_table.items()
                        if positions and UIPC in positions]
            assert len(suffixes) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, suffixes)
            USE += suffixes[0]

        result[U] = (USE, UBlock)
    return result


# From here on `data` maps code point -> (USE tag, block name), and `defaults`
# is the entry used for code points that have none.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Banner comment recording how the table was generated.
for text in (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format(sys.argv[0]),
    " *",
    " * on files with these headers:",
    " *",
):
    print(text)
for header in headers:
    for header_line in header:
        print(" * %s" % (header_line.strip()))
for text in (
    " */",
    "",
    "#ifndef HB_OT_SHAPE_COMPLEX_USE_TABLE_HH",
    "#define HB_OT_SHAPE_COMPLEX_USE_TABLE_HH",
    "",
    '#include "hb.hh"',
    "",
    '#include "hb-ot-shape-complex-use-machine.hh"',
    "",
):
    print(text)

# Running statistics updated by print_block().
total = 0
used = 0
last_block = None


def print_block(block, start, end, data):
    """Print the table cells for code points start..end (inclusive).

    A block-name comment is printed first when *block* is truthy and differs
    from the previously printed block.  Sixteen cells go on each output row;
    *start* and *end* + 1 must be multiples of 8.  Code points missing from
    *data* are printed using `defaults`.  Updates the module-level `total`,
    `used` and `last_block`.
    """
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)
    column = start % 16
    if column:
        # Starting mid-row: pad so the first cell lands in its usual column.
        print(' ' * (20 + (column * 6)), end='')
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    code_points = range(start, end + 1)
    for u in code_points:
        if u % 16 == 0:
            print()
            print(" /* %04X */" % u, end='')
        print("%6s," % data.get(u, defaults)[0], end='')

    total += end - start + 1
    used += sum(1 for u in code_points if u in data)
    if block:
        last_block = block


uu = sorted(data.keys())

# State for the table-emission loop that follows.
last = -100000
num = 0
offset = 0
starts = []
ends = []
print('#pragma GCC diagnostic push')
print('#pragma GCC diagnostic ignored "-Wunused-macros"')
# Shorthand macros for the category tags used in the table.  Tags that have a
# non-empty entry in use_positions are emitted once per positional suffix
# instead of in their bare form.
for k, v in sorted(use_mapping.items()):
    if use_positions.get(k):
        continue
    print("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
for k, v in sorted(use_positions.items()):
    if not v:
        continue
    for suf in v:
        tag = k + suf
        print("#define %s USE(%s)" % (tag, tag))
print('#pragma GCC diagnostic pop')
print("")
print("static const uint8_t use_table[] = {")
# Walk the code points in ascending order and emit them as runs.  Each run
# starts and ends on a multiple of 8; `starts`/`ends` record the half-open
# code point range of every contiguous stretch of the table, and `offset` is
# the table index at which the current stretch begins.
for u in uu:
    if u <= last:
        # Already covered by the previously printed run.
        continue
    if data[u][0] == 'O':
        # 'O' is what the lookup function returns by default; never start a
        # run for it.
        continue
    block = data[u][1]

    start = u//8*8
    end = start+1
    # Extend over consecutive known code points of the same block.  `uu` holds
    # exactly the keys of `data`, so test membership on the dict (constant
    # time) instead of scanning the sorted list on every step.
    while end in data and block == data[end][1]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: fill it with default cells rather than splitting
            # the table.
            print_block(None, last+1, start-1, data)
        else:
            # Large gap: close the current stretch and open a new one with
            # its own offset macro.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print()
            print()
            print("#define use_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, data)
    last = end
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print()
# Lookup function: switch on the page (u >> page_bits), then range-check each
# stretch that begins or ends on that page.
print("static inline uint8_t")
print("hb_use_get_category (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")
pages = {u >> page_bits for u in starts + ends}
for p in sorted(pages):
    print(" case 0x%0Xu:" % p)
    for (start, end) in zip(starts, ends):
        if p not in [start >> page_bits, end >> page_bits]:
            continue
        offset = "use_offset_0x%04xu" % start
        print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print(" break;")
    print("")
print(" default:")
print(" break;")
print(" }")
print(" return USE(O);")
print("}")
print()
# Undefine every shorthand macro defined before the table.
for k in sorted(use_mapping):
    if use_positions.get(k):
        continue
    print("#undef %s" % k)
for k, v in sorted(use_positions.items()):
    if not v:
        continue
    for suf in v:
        tag = k + suf
        print("#undef %s" % tag)
print()
print()
print("#endif /* HB_OT_SHAPE_COMPLEX_USE_TABLE_HH */")
print("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
    raise Exception("Table too sparse, please investigate: ", occupancy)