1#!/usr/bin/env python 2# flake8: noqa 3 4from __future__ import print_function, division, absolute_import 5 6import io 7import sys 8 9if len (sys.argv) != 5: 10 print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr) 11 sys.exit (1) 12 13BLACKLISTED_BLOCKS = ["Thai", "Lao"] 14 15files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]] 16 17headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2] 18headers.append (["UnicodeData.txt does not have a header."]) 19 20data = [{} for f in files] 21values = [{} for f in files] 22for i, f in enumerate (files): 23 for line in f: 24 25 j = line.find ('#') 26 if j >= 0: 27 line = line[:j] 28 29 fields = [x.strip () for x in line.split (';')] 30 if len (fields) == 1: 31 continue 32 33 uu = fields[0].split ('..') 34 start = int (uu[0], 16) 35 if len (uu) == 1: 36 end = start 37 else: 38 end = int (uu[1], 16) 39 40 t = fields[1 if i != 2 else 2] 41 42 for u in range (start, end + 1): 43 data[i][u] = t 44 values[i][t] = values[i].get (t, 0) + end - start + 1 45 46defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block') 47 48# TODO Characters that are not in Unicode Indic files, but used in USE 49data[0][0x034F] = defaults[0] 50data[0][0x1B61] = defaults[0] 51data[0][0x1B63] = defaults[0] 52data[0][0x1B64] = defaults[0] 53data[0][0x1B65] = defaults[0] 54data[0][0x1B66] = defaults[0] 55data[0][0x1B67] = defaults[0] 56data[0][0x1B69] = defaults[0] 57data[0][0x1B6A] = defaults[0] 58data[0][0x2060] = defaults[0] 59# TODO https://github.com/harfbuzz/harfbuzz/pull/1685 60data[0][0x1B5B] = 'Consonant_Placeholder' 61data[0][0x1B5C] = 'Consonant_Placeholder' 62data[0][0x1B5F] = 'Consonant_Placeholder' 63data[0][0x1B62] = 'Consonant_Placeholder' 64data[0][0x1B68] = 'Consonant_Placeholder' 65# TODO https://github.com/harfbuzz/harfbuzz/issues/1035 66data[0][0x11C44] = 'Consonant_Placeholder' 67data[0][0x11C45] = 'Consonant_Placeholder' 68# TODO https://github.com/harfbuzz/harfbuzz/pull/1399 69data[0][0x111C8] = 'Consonant_Placeholder' 70for u in range (0xFE00, 0xFE0F + 1): 71 data[0][u] = defaults[0] 72 73# Merge data into one dict: 74for i,v in enumerate (defaults): 75 values[i][v] = values[i].get (v, 0) + 1 76combined = {} 77for i,d in enumerate (data): 78 for u,v in d.items (): 79 if i >= 2 and not u in combined: 80 continue 81 if not u in combined: 82 combined[u] = list (defaults) 83 combined[u][i] = v 84combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS} 85data = combined 86del combined 87num = len (data) 88 89 90property_names = [ 91 # General_Category 92 'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 93 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 94 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs', 95 # Indic_Syllabic_Category 96 'Other', 97 'Bindu', 98 'Visarga', 99 'Avagraha', 100 'Nukta', 101 'Virama', 102 'Pure_Killer', 103 'Invisible_Stacker', 104 'Vowel_Independent', 105 'Vowel_Dependent', 106 'Vowel', 107 'Consonant_Placeholder', 108 'Consonant', 109 'Consonant_Dead', 110 'Consonant_With_Stacker', 111 'Consonant_Prefixed', 112 'Consonant_Preceding_Repha', 113 'Consonant_Succeeding_Repha', 114 'Consonant_Subjoined', 115 'Consonant_Medial', 116 'Consonant_Final', 117 'Consonant_Head_Letter', 118 'Consonant_Initial_Postfixed', 119 'Modifying_Letter', 120 'Tone_Letter', 121 'Tone_Mark', 122 'Gemination_Mark', 123 'Cantillation_Mark', 124 'Register_Shifter', 125 'Syllable_Modifier', 126 'Consonant_Killer', 127 'Non_Joiner', 128 'Joiner', 129 'Number_Joiner', 130 'Number', 131 'Brahmi_Joining_Number', 132 # Indic_Positional_Category 133 'Not_Applicable', 134 'Right', 135 'Left', 136 'Visual_Order_Left', 137 'Left_And_Right', 138 'Top', 139 'Bottom', 140 'Top_And_Bottom', 141 'Top_And_Right', 142 'Top_And_Left', 143 'Top_And_Left_And_Right', 144 'Bottom_And_Left', 145 'Bottom_And_Right', 146 'Top_And_Bottom_And_Right', 147 'Overstruck', 148] 149 150try: 151 basestring 152except NameError: 153 basestring = str 154 155class PropertyValue(object): 156 def __init__(self, name_): 157 self.name = name_ 158 def __str__(self): 159 return self.name 160 def __eq__(self, other): 161 return self.name == (other if isinstance(other, basestring) else other.name) 162 def __ne__(self, other): 163 return not (self == other) 164 def __hash__(self): 165 return hash(str(self)) 166 167property_values = {} 168 169for name in property_names: 170 value = PropertyValue(name) 171 assert value not in property_values 172 assert value not in globals() 173 property_values[name] = value 174globals().update(property_values) 175 176 177def is_BASE(U, UISC, UGC): 178 return (UISC in [Number, Consonant, Consonant_Head_Letter, 179 #SPEC-DRAFT Consonant_Placeholder, 180 Tone_Letter, 181 Vowel_Independent #SPEC-DRAFT 182 ] or 183 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial, 184 Consonant_Subjoined, Vowel, Vowel_Dependent])) 185def is_BASE_IND(U, UISC, UGC): 186 #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po) 187 return (UISC in [Consonant_Dead, Modifying_Letter] or 188 (UGC == Po and not U in [0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or 189 False # SPEC-DRAFT-OUTDATED! U == 0x002D 190 ) 191def is_BASE_NUM(U, UISC, UGC): 192 return UISC == Brahmi_Joining_Number 193def is_BASE_OTHER(U, UISC, UGC): 194 if UISC == Consonant_Placeholder: return True #SPEC-DRAFT 195 #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 196 return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 197def is_CGJ(U, UISC, UGC): 198 return U == 0x034F 199def is_CONS_FINAL(U, UISC, UGC): 200 return ((UISC == Consonant_Final and UGC != Lo) or 201 UISC == Consonant_Succeeding_Repha) 202def is_CONS_FINAL_MOD(U, UISC, UGC): 203 #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier] 204 return UISC == Syllable_Modifier 205def is_CONS_MED(U, UISC, UGC): 206 # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec. 207 return (UISC == Consonant_Medial and UGC != Lo or 208 UISC == Consonant_Initial_Postfixed) 209def is_CONS_MOD(U, UISC, UGC): 210 return UISC in [Nukta, Gemination_Mark, Consonant_Killer] 211def is_CONS_SUB(U, UISC, UGC): 212 #SPEC-DRAFT return UISC == Consonant_Subjoined 213 return UISC == Consonant_Subjoined and UGC != Lo 214def is_CONS_WITH_STACKER(U, UISC, UGC): 215 return UISC == Consonant_With_Stacker 216def is_HALANT(U, UISC, UGC): 217 return (UISC in [Virama, Invisible_Stacker] 218 and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC) 219 and not is_SAKOT(U, UISC, UGC)) 220def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC): 221 # https://github.com/harfbuzz/harfbuzz/issues/1102 222 # https://github.com/harfbuzz/harfbuzz/issues/1379 223 return U in [0x11046, 0x1134D] 224def is_HALANT_NUM(U, UISC, UGC): 225 return UISC == Number_Joiner 226def is_ZWNJ(U, UISC, UGC): 227 return UISC == Non_Joiner 228def is_ZWJ(U, UISC, UGC): 229 return UISC == Joiner 230def is_Word_Joiner(U, UISC, UGC): 231 return U == 0x2060 232def is_OTHER(U, UISC, UGC): 233 #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters 234 return (UISC == Other 235 and not is_SYM(U, UISC, UGC) 236 and not is_SYM_MOD(U, UISC, UGC) 237 and not is_CGJ(U, UISC, UGC) 238 and not is_Word_Joiner(U, UISC, UGC) 239 and not is_VARIATION_SELECTOR(U, UISC, UGC) 240 ) 241def is_Reserved(U, UISC, UGC): 242 return UGC == 'Cn' 243def is_REPHA(U, UISC, UGC): 244 return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed] 245def is_SAKOT(U, UISC, UGC): 246 return U == 0x1A60 247def is_SYM(U, UISC, UGC): 248 if U == 0x25CC: return False #SPEC-DRAFT 249 #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter 250 return UGC in [So, Sc] and U not in [0x1B62, 0x1B68] 251def is_SYM_MOD(U, UISC, UGC): 252 return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73] 253def is_VARIATION_SELECTOR(U, UISC, UGC): 254 return 0xFE00 <= U <= 0xFE0F 255def is_VOWEL(U, UISC, UGC): 256 # https://github.com/harfbuzz/harfbuzz/issues/376 257 return (UISC == Pure_Killer or 258 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29])) 259def is_VOWEL_MOD(U, UISC, UGC): 260 # https://github.com/harfbuzz/harfbuzz/issues/376 261 return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or 262 (UGC != Lo and (UISC == Bindu or U in [0xAA29]))) 263 264use_mapping = { 265 'B': is_BASE, 266 'IND': is_BASE_IND, 267 'N': is_BASE_NUM, 268 'GB': is_BASE_OTHER, 269 'CGJ': is_CGJ, 270 'F': is_CONS_FINAL, 271 'FM': is_CONS_FINAL_MOD, 272 'M': is_CONS_MED, 273 'CM': is_CONS_MOD, 274 'SUB': is_CONS_SUB, 275 'CS': is_CONS_WITH_STACKER, 276 'H': is_HALANT, 277 'HVM': is_HALANT_OR_VOWEL_MODIFIER, 278 'HN': is_HALANT_NUM, 279 'ZWNJ': is_ZWNJ, 280 'ZWJ': is_ZWJ, 281 'WJ': is_Word_Joiner, 282 'O': is_OTHER, 283 'Rsv': is_Reserved, 284 'R': is_REPHA, 285 'S': is_SYM, 286 'Sk': is_SAKOT, 287 'SM': is_SYM_MOD, 288 'VS': is_VARIATION_SELECTOR, 289 'V': is_VOWEL, 290 'VM': is_VOWEL_MOD, 291} 292 293use_positions = { 294 'F': { 295 'Abv': [Top], 296 'Blw': [Bottom], 297 'Pst': [Right], 298 }, 299 'M': { 300 'Abv': [Top], 301 'Blw': [Bottom, Bottom_And_Left], 302 'Pst': [Right], 303 'Pre': [Left], 304 }, 305 'CM': { 306 'Abv': [Top], 307 'Blw': [Bottom], 308 }, 309 'V': { 310 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right], 311 'Blw': [Bottom, Overstruck, Bottom_And_Right], 312 'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right], 313 'Pre': [Left], 314 }, 315 'VM': { 316 'Abv': [Top], 317 'Blw': [Bottom, Overstruck], 318 'Pst': [Right], 319 'Pre': [Left], 320 }, 321 'SM': { 322 'Abv': [Top], 323 'Blw': [Bottom], 324 }, 325 'H': None, 326 'HVM': None, 327 'B': None, 328 'FM': { 329 'Abv': [Top], 330 'Blw': [Bottom], 331 'Pst': [Not_Applicable], 332 }, 333 'SUB': None, 334} 335 336def map_to_use(data): 337 out = {} 338 items = use_mapping.items() 339 for U,(UISC,UIPC,UGC,UBlock) in data.items(): 340 341 # Resolve Indic_Syllabic_Category 342 343 # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC 344 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark 345 346 # Tibetan: 347 # TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC 348 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent 349 if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark 350 # Overrides to allow NFC order matching syllable 351 # https://github.com/harfbuzz/harfbuzz/issues/1012 352 if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC): 353 if UIPC == Top: 354 UIPC = Bottom 355 356 # TODO: https://github.com/harfbuzz/harfbuzz/pull/982 357 # also https://github.com/harfbuzz/harfbuzz/issues/1012 358 if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC): 359 if UIPC == Top: 360 UIPC = Bottom 361 elif UIPC == Bottom: 362 UIPC = Top 363 364 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627 365 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom 366 367 # TODO: U+1CED should only be allowed after some of 368 # the nasalization marks, maybe only for U+1CE9..U+1CF1. 369 if U == 0x1CED: UISC = Tone_Mark 370 371 # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105 372 if U == 0x11134: UISC = Gemination_Mark 373 374 values = [k for k,v in items if v(U,UISC,UGC)] 375 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values) 376 USE = values[0] 377 378 # Resolve Indic_Positional_Category 379 380 # TODO: These should die, but have UIPC in Unicode 12.0 381 if U in [0x953, 0x954]: UIPC = Not_Applicable 382 383 # TODO: In USE's override list but not in Unicode 12.0 384 if U == 0x103C: UIPC = Left 385 386 # TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0 387 if 0xA926 <= U <= 0xA92A: UIPC = Top 388 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037 389 # and https://github.com/harfbuzz/harfbuzz/issues/1631 390 if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top 391 if U == 0x1171E: UIPC = Left 392 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top 393 394 assert (UIPC in [Not_Applicable, Visual_Order_Left] or 395 USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC) 396 397 pos_mapping = use_positions.get(USE, None) 398 if pos_mapping: 399 values = [k for k,v in pos_mapping.items() if v and UIPC in v] 400 assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values) 401 USE = USE + values[0] 402 403 out[U] = (USE, UBlock) 404 return out 405 406defaults = ('O', 'No_Block') 407data = map_to_use(data) 408 409print ("/* == Start of generated table == */") 410print ("/*") 411print (" * The following table is generated by running:") 412print (" *") 413print (" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt") 414print (" *") 415print (" * on files with these headers:") 416print (" *") 417for h in headers: 418 for l in h: 419 print (" * %s" % (l.strip())) 420print (" */") 421print () 422print ('#include "hb-ot-shape-complex-use.hh"') 423print () 424 425total = 0 426used = 0 427last_block = None 428def print_block (block, start, end, data): 429 global total, used, last_block 430 if block and block != last_block: 431 print () 432 print () 433 print (" /* %s */" % block) 434 if start % 16: 435 print (' ' * (20 + (start % 16 * 6)), end='') 436 num = 0 437 assert start % 8 == 0 438 assert (end+1) % 8 == 0 439 for u in range (start, end+1): 440 if u % 16 == 0: 441 print () 442 print (" /* %04X */" % u, end='') 443 if u in data: 444 num += 1 445 d = data.get (u, defaults) 446 print ("%6s," % d[0], end='') 447 448 total += end - start + 1 449 used += num 450 if block: 451 last_block = block 452 453uu = sorted (data.keys ()) 454 455last = -100000 456num = 0 457offset = 0 458starts = [] 459ends = [] 460print ('#pragma GCC diagnostic push') 461print ('#pragma GCC diagnostic ignored "-Wunused-macros"') 462for k,v in sorted(use_mapping.items()): 463 if k in use_positions and use_positions[k]: continue 464 print ("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:])) 465for k,v in sorted(use_positions.items()): 466 if not v: continue 467 for suf in v.keys(): 468 tag = k + suf 469 print ("#define %s USE_%s" % (tag, tag)) 470print ('#pragma GCC diagnostic pop') 471print ("") 472print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {") 473for u in uu: 474 if u <= last: 475 continue 476 block = data[u][1] 477 478 start = u//8*8 479 end = start+1 480 while end in uu and block == data[end][1]: 481 end += 1 482 end = (end-1)//8*8 + 7 483 484 if start != last + 1: 485 if start - last <= 1+16*3: 486 print_block (None, last+1, start-1, data) 487 last = start-1 488 else: 489 if last >= 0: 490 ends.append (last + 1) 491 offset += ends[-1] - starts[-1] 492 print () 493 print () 494 print ("#define use_offset_0x%04xu %d" % (start, offset)) 495 starts.append (start) 496 497 print_block (block, start, end, data) 498 last = end 499ends.append (last + 1) 500offset += ends[-1] - starts[-1] 501print () 502print () 503occupancy = used * 100. / total 504page_bits = 12 505print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)) 506print () 507print ("USE_TABLE_ELEMENT_TYPE") 508print ("hb_use_get_category (hb_codepoint_t u)") 509print ("{") 510print (" switch (u >> %d)" % page_bits) 511print (" {") 512pages = set([u>>page_bits for u in starts+ends]) 513for p in sorted(pages): 514 print (" case 0x%0Xu:" % p) 515 for (start,end) in zip (starts, ends): 516 if p not in [start>>page_bits, end>>page_bits]: continue 517 offset = "use_offset_0x%04xu" % start 518 print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)) 519 print (" break;") 520 print ("") 521print (" default:") 522print (" break;") 523print (" }") 524print (" return USE_O;") 525print ("}") 526print () 527for k in sorted(use_mapping.keys()): 528 if k in use_positions and use_positions[k]: continue 529 print ("#undef %s" % k) 530for k,v in sorted(use_positions.items()): 531 if not v: continue 532 for suf in v.keys(): 533 tag = k + suf 534 print ("#undef %s" % tag) 535print () 536print ("/* == End of generated table == */") 537 538# Maintain at least 50% occupancy in the table */ 539if occupancy < 50: 540 raise Exception ("Table too sparse, please investigate: ", occupancy) 541