1#!/usr/bin/python 2 3import sys 4 5if len (sys.argv) != 5: 6 print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt" 7 sys.exit (1) 8 9BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"] 10 11files = [file (x) for x in sys.argv[1:]] 12 13headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2] 14headers.append (["UnicodeData.txt does not have a header."]) 15 16data = [{} for f in files] 17values = [{} for f in files] 18for i, f in enumerate (files): 19 for line in f: 20 21 j = line.find ('#') 22 if j >= 0: 23 line = line[:j] 24 25 fields = [x.strip () for x in line.split (';')] 26 if len (fields) == 1: 27 continue 28 29 uu = fields[0].split ('..') 30 start = int (uu[0], 16) 31 if len (uu) == 1: 32 end = start 33 else: 34 end = int (uu[1], 16) 35 36 t = fields[1 if i != 2 else 2] 37 38 for u in range (start, end + 1): 39 data[i][u] = t 40 values[i][t] = values[i].get (t, 0) + end - start + 1 41 42defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block') 43 44# TODO Characters that are not in Unicode Indic files, but used in USE 45data[0][0x034F] = defaults[0] 46data[0][0x2060] = defaults[0] 47data[0][0x20F0] = defaults[0] 48for u in range (0xFE00, 0xFE0F + 1): 49 data[0][u] = defaults[0] 50 51# Merge data into one dict: 52for i,v in enumerate (defaults): 53 values[i][v] = values[i].get (v, 0) + 1 54combined = {} 55for i,d in enumerate (data): 56 for u,v in d.items (): 57 if i >= 2 and not u in combined: 58 continue 59 if not u in combined: 60 combined[u] = list (defaults) 61 combined[u][i] = v 62combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS} 63data = combined 64del combined 65num = len (data) 66 67 68property_names = [ 69 # General_Category 70 'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 71 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 72 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs', 73 # Indic_Syllabic_Category 74 'Other', 75 'Bindu', 76 'Visarga', 77 'Avagraha', 78 'Nukta', 79 'Virama', 80 'Pure_Killer', 81 'Invisible_Stacker', 82 'Vowel_Independent', 83 'Vowel_Dependent', 84 'Vowel', 85 'Consonant_Placeholder', 86 'Consonant', 87 'Consonant_Dead', 88 'Consonant_With_Stacker', 89 'Consonant_Prefixed', 90 'Consonant_Preceding_Repha', 91 'Consonant_Succeeding_Repha', 92 'Consonant_Subjoined', 93 'Consonant_Medial', 94 'Consonant_Final', 95 'Consonant_Head_Letter', 96 'Modifying_Letter', 97 'Tone_Letter', 98 'Tone_Mark', 99 'Gemination_Mark', 100 'Cantillation_Mark', 101 'Register_Shifter', 102 'Syllable_Modifier', 103 'Consonant_Killer', 104 'Non_Joiner', 105 'Joiner', 106 'Number_Joiner', 107 'Number', 108 'Brahmi_Joining_Number', 109 # Indic_Positional_Category 110 'Not_Applicable', 111 'Right', 112 'Left', 113 'Visual_Order_Left', 114 'Left_And_Right', 115 'Top', 116 'Bottom', 117 'Top_And_Bottom', 118 'Top_And_Right', 119 'Top_And_Left', 120 'Top_And_Left_And_Right', 121 'Bottom_And_Left', 122 'Bottom_And_Right', 123 'Top_And_Bottom_And_Right', 124 'Overstruck', 125] 126 127class PropertyValue(object): 128 def __init__(self, name_): 129 self.name = name_ 130 def __str__(self): 131 return self.name 132 def __eq__(self, other): 133 return self.name == (other if isinstance(other, basestring) else other.name) 134 def __ne__(self, other): 135 return not (self == other) 136 137property_values = {} 138 139for name in property_names: 140 value = PropertyValue(name) 141 assert value not in property_values 142 assert value not in globals() 143 property_values[name] = value 144globals().update(property_values) 145 146 147def is_BASE(U, UISC, UGC): 148 return (UISC in [Number, Consonant, Consonant_Head_Letter, 149 #SPEC-DRAFT Consonant_Placeholder, 150 Tone_Letter, 151 Vowel_Independent #SPEC-DRAFT 152 ] or 153 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial, 154 Consonant_Subjoined, Vowel, Vowel_Dependent])) 155def is_BASE_IND(U, UISC, UGC): 156 #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po) 157 return (UISC in [Consonant_Dead, Modifying_Letter] or 158 (UGC == Po and not U in [0x104E, 0x2022, 0x11A3F, 0x11A45]) or 159 False # SPEC-DRAFT-OUTDATED! U == 0x002D 160 ) 161def is_BASE_NUM(U, UISC, UGC): 162 return UISC == Brahmi_Joining_Number 163def is_BASE_OTHER(U, UISC, UGC): 164 if UISC == Consonant_Placeholder: return True #SPEC-DRAFT 165 #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 166 return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 167def is_CGJ(U, UISC, UGC): 168 return U == 0x034F 169def is_CONS_FINAL(U, UISC, UGC): 170 return ((UISC == Consonant_Final and UGC != Lo) or 171 UISC == Consonant_Succeeding_Repha) 172def is_CONS_FINAL_MOD(U, UISC, UGC): 173 #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier] 174 return UISC == Syllable_Modifier 175def is_CONS_MED(U, UISC, UGC): 176 return UISC == Consonant_Medial and UGC != Lo 177def is_CONS_MOD(U, UISC, UGC): 178 return UISC in [Nukta, Gemination_Mark, Consonant_Killer] 179def is_CONS_SUB(U, UISC, UGC): 180 #SPEC-DRAFT return UISC == Consonant_Subjoined 181 return UISC == Consonant_Subjoined and UGC != Lo 182def is_CONS_WITH_STACKER(U, UISC, UGC): 183 return UISC == Consonant_With_Stacker 184def is_HALANT(U, UISC, UGC): 185 return UISC in [Virama, Invisible_Stacker] 186def is_HALANT_NUM(U, UISC, UGC): 187 return UISC == Number_Joiner 188def is_ZWNJ(U, UISC, UGC): 189 return UISC == Non_Joiner 190def is_ZWJ(U, UISC, UGC): 191 return UISC == Joiner 192def is_Word_Joiner(U, UISC, UGC): 193 return U == 0x2060 194def is_OTHER(U, UISC, UGC): 195 #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters 196 return (UISC == Other 197 and not is_SYM_MOD(U, UISC, UGC) 198 and not is_CGJ(U, UISC, UGC) 199 and not is_Word_Joiner(U, UISC, UGC) 200 and not is_VARIATION_SELECTOR(U, UISC, UGC) 201 ) 202def is_Reserved(U, UISC, UGC): 203 return UGC == 'Cn' 204def is_REPHA(U, UISC, UGC): 205 return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed] 206def is_SYM(U, UISC, UGC): 207 if U == 0x25CC: return False #SPEC-DRAFT 208 #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter 209 return UGC in [So, Sc] 210def is_SYM_MOD(U, UISC, UGC): 211 return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73] 212def is_VARIATION_SELECTOR(U, UISC, UGC): 213 return 0xFE00 <= U <= 0xFE0F 214def is_VOWEL(U, UISC, UGC): 215 # https://github.com/roozbehp/unicode-data/issues/6 216 return (UISC == Pure_Killer or 217 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29])) 218def is_VOWEL_MOD(U, UISC, UGC): 219 # https://github.com/roozbehp/unicode-data/issues/6 220 return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or 221 (UGC != Lo and (UISC == Bindu or U in [0xAA29]))) 222 223use_mapping = { 224 'B': is_BASE, 225 'IND': is_BASE_IND, 226 'N': is_BASE_NUM, 227 'GB': is_BASE_OTHER, 228 'CGJ': is_CGJ, 229 'F': is_CONS_FINAL, 230 'FM': is_CONS_FINAL_MOD, 231 'M': is_CONS_MED, 232 'CM': is_CONS_MOD, 233 'SUB': is_CONS_SUB, 234 'CS': is_CONS_WITH_STACKER, 235 'H': is_HALANT, 236 'HN': is_HALANT_NUM, 237 'ZWNJ': is_ZWNJ, 238 'ZWJ': is_ZWJ, 239 'WJ': is_Word_Joiner, 240 'O': is_OTHER, 241 'Rsv': is_Reserved, 242 'R': is_REPHA, 243 'S': is_SYM, 244 'SM': is_SYM_MOD, 245 'VS': is_VARIATION_SELECTOR, 246 'V': is_VOWEL, 247 'VM': is_VOWEL_MOD, 248} 249 250use_positions = { 251 'F': { 252 'Abv': [Top], 253 'Blw': [Bottom], 254 'Pst': [Right], 255 }, 256 'M': { 257 'Abv': [Top], 258 'Blw': [Bottom, Bottom_And_Left], 259 'Pst': [Right], 260 'Pre': [Left], 261 }, 262 'CM': { 263 'Abv': [Top], 264 'Blw': [Bottom], 265 }, 266 'V': { 267 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right], 268 'Blw': [Bottom, Overstruck, Bottom_And_Right], 269 'Pst': [Right], 270 'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right], 271 }, 272 'VM': { 273 'Abv': [Top], 274 'Blw': [Bottom, Overstruck], 275 'Pst': [Right], 276 'Pre': [Left], 277 }, 278 'SM': { 279 'Abv': [Top], 280 'Blw': [Bottom], 281 }, 282 'H': None, 283 'B': None, 284 'FM': None, 285 'SUB': None, 286} 287 288def map_to_use(data): 289 out = {} 290 items = use_mapping.items() 291 for U,(UISC,UIPC,UGC,UBlock) in data.items(): 292 293 # Resolve Indic_Syllabic_Category 294 295 # TODO: These don't have UISC assigned in Unicode 8.0, but 296 # have UIPC 297 if U == 0x17DD: UISC = Vowel_Dependent 298 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark 299 300 # TODO: U+1CED should only be allowed after some of 301 # the nasalization marks, maybe only for U+1CE9..U+1CF1. 302 if U == 0x1CED: UISC = Tone_Mark 303 304 # TODO: https://github.com/harfbuzz/harfbuzz/issues/525 305 if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom 306 307 # TODO: https://github.com/harfbuzz/harfbuzz/pull/609 308 if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top 309 310 # TODO: https://github.com/harfbuzz/harfbuzz/pull/626 311 if U == 0xA8B4: UISC = Consonant_Medial 312 313 values = [k for k,v in items if v(U,UISC,UGC)] 314 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values) 315 USE = values[0] 316 317 # Resolve Indic_Positional_Category 318 319 # TODO: Not in Unicode 8.0 yet, but in spec. 320 if U == 0x1B6C: UIPC = Bottom 321 322 # TODO: These should die, but have UIPC in Unicode 8.0 323 if U in [0x953, 0x954]: UIPC = Not_Applicable 324 325 # TODO: In USE's override list but not in Unicode 8.0 326 if U == 0x103C: UIPC = Left 327 328 # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0 329 if 0xA926 <= U <= 0xA92A: UIPC = Top 330 if U == 0x111CA: UIPC = Bottom 331 if U == 0x11300: UIPC = Top 332 if U == 0x1133C: UIPC = Bottom 333 if U == 0x1171E: UIPC = Left # Correct?! 334 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right 335 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top 336 337 assert (UIPC in [Not_Applicable, Visual_Order_Left] or 338 USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC) 339 340 pos_mapping = use_positions.get(USE, None) 341 if pos_mapping: 342 values = [k for k,v in pos_mapping.items() if v and UIPC in v] 343 assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values) 344 USE = USE + values[0] 345 346 out[U] = (USE, UBlock) 347 return out 348 349defaults = ('O', 'No_Block') 350data = map_to_use(data) 351 352# Remove the outliers 353singles = {} 354for u in [0x034F, 0x25CC, 0x1107F]: 355 singles[u] = data[u] 356 del data[u] 357 358print "/* == Start of generated table == */" 359print "/*" 360print " * The following table is generated by running:" 361print " *" 362print " * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt" 363print " *" 364print " * on files with these headers:" 365print " *" 366for h in headers: 367 for l in h: 368 print " * %s" % (l.strip()) 369print " */" 370print 371print '#include "hb-ot-shape-complex-use-private.hh"' 372print 373 374total = 0 375used = 0 376last_block = None 377def print_block (block, start, end, data): 378 global total, used, last_block 379 if block and block != last_block: 380 print 381 print 382 print " /* %s */" % block 383 if start % 16: 384 print ' ' * (20 + (start % 16 * 6)), 385 num = 0 386 assert start % 8 == 0 387 assert (end+1) % 8 == 0 388 for u in range (start, end+1): 389 if u % 16 == 0: 390 print 391 print " /* %04X */" % u, 392 if u in data: 393 num += 1 394 d = data.get (u, defaults) 395 sys.stdout.write ("%6s," % d[0]) 396 397 total += end - start + 1 398 used += num 399 if block: 400 last_block = block 401 402uu = data.keys () 403uu.sort () 404 405last = -100000 406num = 0 407offset = 0 408starts = [] 409ends = [] 410for k,v in sorted(use_mapping.items()): 411 if k in use_positions and use_positions[k]: continue 412 print "#define %s USE_%s /* %s */" % (k, k, v.__name__[3:]) 413for k,v in sorted(use_positions.items()): 414 if not v: continue 415 for suf in v.keys(): 416 tag = k + suf 417 print "#define %s USE_%s" % (tag, tag) 418print "" 419print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {" 420for u in uu: 421 if u <= last: 422 continue 423 block = data[u][1] 424 425 start = u//8*8 426 end = start+1 427 while end in uu and block == data[end][1]: 428 end += 1 429 end = (end-1)//8*8 + 7 430 431 if start != last + 1: 432 if start - last <= 1+16*3: 433 print_block (None, last+1, start-1, data) 434 last = start-1 435 else: 436 if last >= 0: 437 ends.append (last + 1) 438 offset += ends[-1] - starts[-1] 439 print 440 print 441 print "#define use_offset_0x%04xu %d" % (start, offset) 442 starts.append (start) 443 444 print_block (block, start, end, data) 445 last = end 446ends.append (last + 1) 447offset += ends[-1] - starts[-1] 448print 449print 450occupancy = used * 100. / total 451page_bits = 12 452print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy) 453print 454print "USE_TABLE_ELEMENT_TYPE" 455print "hb_use_get_categories (hb_codepoint_t u)" 456print "{" 457print " switch (u >> %d)" % page_bits 458print " {" 459pages = set([u>>page_bits for u in starts+ends+singles.keys()]) 460for p in sorted(pages): 461 print " case 0x%0Xu:" % p 462 for (start,end) in zip (starts, ends): 463 if p not in [start>>page_bits, end>>page_bits]: continue 464 offset = "use_offset_0x%04xu" % start 465 print " if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset) 466 for u,d in singles.items (): 467 if p != u>>page_bits: continue 468 print " if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0]) 469 print " break;" 470 print "" 471print " default:" 472print " break;" 473print " }" 474print " return USE_O;" 475print "}" 476print 477for k in sorted(use_mapping.keys()): 478 if k in use_positions and use_positions[k]: continue 479 print "#undef %s" % k 480for k,v in sorted(use_positions.items()): 481 if not v: continue 482 for suf in v.keys(): 483 tag = k + suf 484 print "#undef %s" % tag 485print 486print "/* == End of generated table == */" 487 488# Maintain at least 50% occupancy in the table */ 489if occupancy < 50: 490 raise Exception ("Table too sparse, please investigate: ", occupancy) 491