#!/usr/bin/env python3
# flake8: noqa: F821

"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""

import logging
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

import sys

if len (sys.argv) != 10:
	sys.exit (__doc__)

DISABLED_SCRIPTS = {
	'Arabic',
	'Lao',
	'Samaritan',
	'Syriac',
	'Thai',
}

files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
for j in range(7, 9):
	for line in files[j]:
		line = line.rstrip()
		if not line:
			break
		headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])

unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
	for line in f:

		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		t = fields[1 if i not in [2, 4] else 2]

		if i == 2:
			t = 'jt_' + t
		elif i == 3 and t != 'Default_Ignorable_Code_Point':
			continue
		elif i == 7 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 8 and t == 'NA':
			t = 'Not_Applicable'

		# The ms-use/*-Additional.txt files (i == 7, 8) override the data
		# read from the corresponding UCD files (i == 0, 1).
		i0 = i if i < 7 else i - 7
		for u in range (start, end + 1):
			unicode_data[i0][u] = t
			values[i0][t] = values[i0].get (t, 0) + end - start + 1

defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (unicode_data):
	for u,v in d.items ():
		if not u in combined:
			if i >= 4:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
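
# Each surviving entry of `combined` is a 7-element list indexed by input file:
#   [Indic_Syllabic_Category, Indic_Positional_Category, Joining_Type,
#    Default_Ignorable_Code_Point (or ''), General_Category, Block, Script]
# For example (a sketch; exact values depend on the UCD version fed in),
# U+0915 DEVANAGARI LETTER KA comes out roughly as
#   ['Consonant', 'Not_Applicable', 'jt_X', '', 'Lo', 'Devanagari', 'Devanagari'].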


property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	'Symbol_Modifier',
	'Hieroglyph',
	'Hieroglyph_Joiner',
	'Hieroglyph_Mark_Begin',
	'Hieroglyph_Mark_End',
	'Hieroglyph_Mirror',
	'Hieroglyph_Modifier',
	'Hieroglyph_Segment_Begin',
	'Hieroglyph_Segment_End',
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Bottom_And_Left',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
	# Joining_Type
	'jt_C',
	'jt_D',
	'jt_L',
	'jt_R',
	'jt_T',
	'jt_U',
	'jt_X',
]

class PropertyValue(object):
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __eq__(self, other):
		return self.name == (other if isinstance(other, str) else other.name)
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		return hash(str(self))

property_values = {}

for name in property_names:
	value = PropertyValue(name)
	assert value not in property_values
	assert value not in globals()
	property_values[name] = value
globals().update(property_values)
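
# Note: PropertyValue.__eq__ also accepts plain strings, so the classifier
# functions below can compare the raw strings parsed from the UCD files against
# the names injected into globals() above, e.g.
#   UISC == Virama           # UISC is a str such as 'Virama'
#   UISC in [Nukta, Bindu]   # list membership goes through __eq__ as well
# This is also why the F821 (undefined-name) flake8 check is silenced at the
# top of the file.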


def is_BASE(U, UISC, UDI, UGC, AJT):
	return (UISC in [Number, Consonant, Consonant_Head_Letter,
			Tone_Letter,
			Vowel_Independent,
			] or
		# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
		AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
					Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
	if UISC == Consonant_Placeholder: return True
	return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
	# Also includes VARIATION_SELECTOR and ZWJ
	return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
	return ((UISC == Consonant_Final and UGC != Lo) or
		UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	return (UISC == Consonant_Medial and UGC != Lo or
		UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
	return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return (UISC == Invisible_Stacker
		and not is_SAKOT(U, UISC, UDI, UGC, AJT)
	)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
	return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
	# Also includes BASE_IND and SYM
	return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
		and not is_BASE(U, UISC, UDI, UGC, AJT)
		and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
		and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
		and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
	)
def is_REPHA(U, UISC, UDI, UGC, AJT):
	return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
	return (UISC == Pure_Killer or
		UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
		UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
	# Also includes Rsv
	return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
		and UISC == Other
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
	) or UGC == Cn

use_mapping = {
	'B': is_BASE,
	'N': is_BASE_NUM,
	'GB': is_BASE_OTHER,
	'CGJ': is_CGJ,
	'F': is_CONS_FINAL,
	'FM': is_CONS_FINAL_MOD,
	'M': is_CONS_MED,
	'CM': is_CONS_MOD,
	'SUB': is_CONS_SUB,
	'CS': is_CONS_WITH_STACKER,
	'H': is_HALANT,
	'HVM': is_HALANT_OR_VOWEL_MODIFIER,
	'HN': is_HALANT_NUM,
	'IS': is_INVISIBLE_STACKER,
	'G': is_HIEROGLYPH,
	'HM': is_HIEROGLYPH_MOD,
	'HR': is_HIEROGLYPH_MIRROR,
	'J': is_HIEROGLYPH_JOINER,
	'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
	'SE': is_HIEROGLYPH_SEGMENT_END,
	'ZWNJ': is_ZWNJ,
	'O': is_OTHER,
	'R': is_REPHA,
	'Sk': is_SAKOT,
	'SM': is_SYM_MOD,
	'V': is_VOWEL,
	'VM': is_VOWEL_MOD,
	'WJ': is_Word_Joiner,
}

use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Bottom_And_Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'HM': None,
	'HR': None,
	'HVM': None,
	'IS': None,
	'B': None,
	'FM': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Not_Applicable],
	},
	'R': None,
	'SUB': None,
}
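
# Categories with an entry in use_positions get a positional suffix appended by
# map_to_use() below, based on Indic_Positional_Category: e.g. a below-base
# dependent vowel becomes 'VBlw' and a pre-base one becomes 'VPre', while
# categories mapped to None (H, B, R, ...) are emitted without a suffix.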

def map_to_use(data):
	out = {}
	items = use_mapping.items()
	for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED: UISC = Tone_Mark

		values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
		assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, values)
		USE = values[0]

		# Resolve Indic_Positional_Category

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		# and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top

		assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
			USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

		pos_mapping = use_positions.get(USE, None)
		if pos_mapping:
			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
			assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
			USE = USE + values[0]

		out[U] = (USE, UBlock)
	return out

use_data = map_to_use(combined)
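
# From here on the script writes the generated header to stdout: the file
# comment (echoing the input files' headers), the USE() category macros, and
# two packTab-compressed lookup tables selected at compile time via
# HB_OPTIMIZE_SIZE, followed by the matching #undefs.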

print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" *   {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))
print (" */")
print ()
print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
print ("#define HB_OT_SHAPER_USE_TABLE_HH")
print ()
print ('#include "hb.hh"')
print ()
print ('#include "hb-ot-shaper-use-machine.hh"')
print ()

total = 0
used = 0
last_block = None
def print_block (block, start, end, use_data):
	global total, used, last_block
	if block and block != last_block:
		print ()
		print ()
		print ("  /* %s */" % block)
		if start % 16:
			print (' ' * (20 + (start % 16 * 6)), end='')
	num = 0
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 16 == 0:
			print ()
			print ("  /* %04X */" % u, end='')
		if u in use_data:
			num += 1
		d = use_data.get (u)
		if d is not None:
			d = d[0]
		elif u in unicode_data[4]:
			d = 'O'
		else:
			d = 'WJ'
		print ("%6s," % d, end='')

	total += end - start + 1
	used += num
	if block:
		last_block = block

uu = sorted (use_data.keys ())

last = -100000
num = 0
offset = 0
starts = []
ends = []
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k,v in sorted(use_mapping.items()):
	if k in use_positions and use_positions[k]: continue
	print ("#define %s	USE(%s)	/* %s */" % (k, k, v.__name__[3:]))
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#define %s	USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")


import packTab
data = {u:v[0] for u,v in use_data.items()}

DEFAULT = 5
COMPACT = 9
for compression in (DEFAULT, COMPACT):

	logging.info('  Compression=%d:' % compression)
	print()
	if compression == DEFAULT:
		print('#ifndef HB_OPTIMIZE_SIZE')
	elif compression == COMPACT:
		print('#else')
	else:
		assert False
	print()

	code = packTab.Code('hb_use')
	sol = packTab.pack_table(data, compression=compression, default='O')
	logging.info('  FullCost=%d' % (sol.fullCost))
	sol.genCode(code, f'get_category')
	code.print_c(linkage='static inline')
	print ()

print('#endif')

print ()
for k in sorted(use_mapping.keys()):
	if k in use_positions and use_positions[k]: continue
	print ("#undef %s" % k)
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#undef %s" % tag)
print ()
print ()
print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print ("/* == End of generated table == */")