#!/usr/bin/python
"""Generate the USE category lookup table as C source on standard output.

Usage:

    ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt

The four input files are parsed into per-code-point properties (Indic
syllabic category, Indic positional category, general category, block),
merged, mapped to USE category tags by the ``is_*`` predicates below, and
printed as a ``use_table`` array plus a ``hb_use_get_categories`` function.

The module only defines names when imported; all file and console I/O
happens in ``main``.  The code runs unchanged on Python 2.7 and Python 3.
"""

import sys

try:
    _string_types = basestring  # Python 2
except NameError:
    _string_types = str  # Python 3

BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]


def _emit(text='', end='\n'):
    """Write ``text`` followed by ``end`` to the current ``sys.stdout``."""
    sys.stdout.write(text + end)


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
]


class PropertyValue(object):
    """A named property value that compares equal to its own name.

    Instances compare equal to another ``PropertyValue`` with the same name
    and to a plain string equal to that name, so values read from the data
    files (strings) can be tested against the module-level constants.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        if isinstance(other, PropertyValue):
            return self.name == other.name
        if isinstance(other, _string_types):
            return self.name == other
        # Unrelated types are simply "not equal" instead of raising
        # AttributeError on the missing ``.name``.
        return NotImplemented

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must agree with __eq__: a value hashes like its name, so it can be
        # looked up in containers keyed by either form.  Defining __eq__
        # without __hash__ would also make instances unhashable on Python 3.
        return hash(self.name)


property_values = {}

for name in property_names:
    value = PropertyValue(name)
    # Guard against duplicate names and against clobbering an existing
    # module-level name when the constants are injected below.
    assert name not in property_values
    assert name not in globals()
    property_values[name] = value
# Expose every property value as a module-level constant (Lo, Virama, Top...).
globals().update(property_values)


# Each predicate below takes the code point U, its Indic syllabic category
# UISC and its general category UGC, and tells whether the code point belongs
# to the USE category the predicate is registered under in use_mapping.

def is_BASE(U, UISC, UGC):
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
                     #SPEC-DRAFT Consonant_Placeholder,
                     Tone_Letter,
                     Vowel_Independent #SPEC-DRAFT
                     ] or
            (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                    Consonant_Subjoined, Vowel, Vowel_Dependent]))

def is_BASE_IND(U, UISC, UGC):
    #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    return (UISC in [Consonant_Dead, Modifying_Letter] or
            (UGC == Po and U not in [0x104E, 0x2022]) or
            False # SPEC-DRAFT-OUTDATED! U == 0x002D
            )

def is_BASE_NUM(U, UISC, UGC):
    return UISC == Brahmi_Joining_Number

def is_BASE_OTHER(U, UISC, UGC):
    if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
    #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]

def is_CGJ(U, UISC, UGC):
    return U == 0x034F

def is_CONS_FINAL(U, UISC, UGC):
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)

def is_CONS_FINAL_MOD(U, UISC, UGC):
    #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    return UISC == Syllable_Modifier

def is_CONS_MED(U, UISC, UGC):
    return UISC == Consonant_Medial and UGC != Lo

def is_CONS_MOD(U, UISC, UGC):
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]

def is_CONS_SUB(U, UISC, UGC):
    #SPEC-DRAFT return UISC == Consonant_Subjoined
    return UISC == Consonant_Subjoined and UGC != Lo

def is_HALANT(U, UISC, UGC):
    return UISC in [Virama, Invisible_Stacker]

def is_HALANT_NUM(U, UISC, UGC):
    return UISC == Number_Joiner

def is_ZWNJ(U, UISC, UGC):
    return UISC == Non_Joiner

def is_ZWJ(U, UISC, UGC):
    return UISC == Joiner

def is_Word_Joiner(U, UISC, UGC):
    return U == 0x2060

def is_OTHER(U, UISC, UGC):
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    return (UISC == Other
            and not is_SYM_MOD(U, UISC, UGC)
            and not is_CGJ(U, UISC, UGC)
            and not is_Word_Joiner(U, UISC, UGC)
            and not is_VARIATION_SELECTOR(U, UISC, UGC)
            )

def is_Reserved(U, UISC, UGC):
    # Uses the Cn constant like the sibling predicates; it compares equal to
    # the string 'Cn' read from the data files.
    return UGC == Cn

def is_REPHA(U, UISC, UGC):
    #return UISC == Consonant_Preceding_Repha
    #SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
    return UISC in [Consonant_Preceding_Repha, Consonant_With_Stacker, Consonant_Prefixed]

def is_SYM(U, UISC, UGC):
    if U == 0x25CC: return False #SPEC-DRAFT
    #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    return UGC in [So, Sc]

def is_SYM_MOD(U, UISC, UGC):
    return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]

def is_VARIATION_SELECTOR(U, UISC, UGC):
    return 0xFE00 <= U <= 0xFE0F

def is_VOWEL(U, UISC, UGC):
    return (UISC == Pure_Killer or
            (UGC != Lo and UISC in [Vowel, Vowel_Dependent]))

def is_VOWEL_MOD(U, UISC, UGC):
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            (UGC != Lo and UISC == Bindu))


# USE category tag -> predicate deciding membership in that category.
use_mapping = {
    'B': is_BASE,
    'IND': is_BASE_IND,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'H': is_HALANT,
    'HN': is_HALANT_NUM,
    'ZWNJ': is_ZWNJ,
    'ZWJ': is_ZWJ,
    'WJ': is_Word_Joiner,
    'O': is_OTHER,
    'Rsv': is_Reserved,
    'R': is_REPHA,
    'S': is_SYM,
    'SM': is_SYM_MOD,
    'VS': is_VARIATION_SELECTOR,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}

# USE category tag -> {position suffix: positional categories selecting it}.
# A tag mapped to None may carry a positional category but gets no suffix.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
        'Pre': [Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'B': None,
    'FM': None,
    'SUB': None,
}


def map_to_use(data):
    """Map merged per-code-point properties to USE tags.

    ``data`` maps a code point to a 4-sequence
    ``(UISC, UIPC, UGC, UBlock)``.  Returns a dict mapping each code point
    to ``(USE, UBlock)``, where ``USE`` is the single matching tag from
    ``use_mapping`` with a position suffix from ``use_positions`` appended
    when that tag has one.

    Raises AssertionError when a code point matches zero or several
    categories, or when its positional category cannot be resolved.
    """
    out = {}
    items = list(use_mapping.items())
    for U, (UISC, UIPC, UGC, UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 8.0, but
        # have UIPC
        if U == 0x17DD: UISC = Vowel_Dependent
        if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED: UISC = Tone_Mark

        evals = [(k, v(U, UISC, UGC)) for k, v in items]
        values = [k for k, v in evals if v]
        assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
        USE = values[0]

        # Resolve Indic_Positional_Category

        # TODO: Not in Unicode 8.0 yet, but in spec.
        if U == 0x1B6C: UIPC = Bottom

        # TODO: These should die, but have UIPC in Unicode 8.0
        if U in [0x953, 0x954]: UIPC = Not_Applicable

        # TODO: In USE's override list but not in Unicode 8.0
        if U == 0x103C: UIPC = Left

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
        if 0xA926 <= U <= 0xA92A: UIPC = Top
        if U == 0x111CA: UIPC = Bottom
        if U == 0x11300: UIPC = Top
        if U == 0x1133C: UIPC = Bottom
        if U == 0x1171E: UIPC = Left # Correct?!
        if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
        if 0x1CF8 <= U <= 0x1CF9: UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            values = [k for k, v in pos_mapping.items() if v and UIPC in v]
            assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
            USE = USE + values[0]

        out[U] = (USE, UBlock)
    return out


def load_unicode_data(paths):
    """Parse the four input files named on the command line.

    ``paths`` lists the files in command-line order; the third one is the
    file whose value lives in field 2 and which has no header (the usage
    message names it UnicodeData.txt).

    Returns ``(headers, data)``.  ``headers`` holds the first two lines of
    every file except the third, followed by a fixed note for the third.
    ``data`` is one dict per file mapping code point -> property value.
    """
    files = []
    try:
        for path in paths:
            files.append(open(path))

        headers = [[f.readline() for i in range(2)]
                   for j, f in enumerate(files) if j != 2]
        headers.append(["UnicodeData.txt does not have a header."])

        data = [{} for f in files]
        for i, f in enumerate(files):
            for line in f:

                # Drop trailing comments.
                j = line.find('#')
                if j >= 0:
                    line = line[:j]

                fields = [x.strip() for x in line.split(';')]
                if len(fields) == 1:
                    continue

                # First field is either "XXXX" or "XXXX..YYYY" (inclusive).
                uu = fields[0].split('..')
                start = int(uu[0], 16)
                if len(uu) == 1:
                    end = start
                else:
                    end = int(uu[1], 16)

                t = fields[1 if i != 2 else 2]

                for u in range(start, end + 1):
                    data[i][u] = t
    finally:
        # The original never closed its input files.
        for f in files:
            f.close()
    return headers, data


def merge_properties(data):
    """Merge the four per-file dicts into one dict of 4-item lists.

    Only code points present in the first or second dict get an entry; the
    third and fourth dicts merely fill in fields of existing entries.
    Missing fields keep their default.  Entries whose block is listed in
    ``BLACKLISTED_BLOCKS`` are dropped.  ``data[0]`` is extended in place
    with the extra code points noted below.
    """
    unicode_defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

    # TODO Characters that are not in Unicode Indic files, but used in USE
    data[0][0x034F] = unicode_defaults[0]
    data[0][0x2060] = unicode_defaults[0]
    for u in range(0xFE00, 0xFE0F + 1):
        data[0][u] = unicode_defaults[0]

    # Merge data into one dict:
    combined = {}
    for i, d in enumerate(data):
        for u, v in d.items():
            if i >= 2 and u not in combined:
                continue
            if u not in combined:
                combined[u] = list(unicode_defaults)
            combined[u][i] = v
    return {k: v for k, v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}


# (USE tag, block) used by print_block for code points without an entry.
defaults = ('O', 'No_Block')

# Running statistics maintained by print_block.
total = 0
used = 0
last_block = None

def print_block(block, start, end, data):
    """Print table entries for code points ``start``..``end`` inclusive.

    ``start`` and ``end + 1`` must be multiples of 8.  A block-name comment
    is printed when ``block`` is set and differs from the previous call's
    block.  Code points missing from ``data`` are printed with the default
    tag.  Updates the module-level ``total``, ``used`` and ``last_block``.
    """
    global total, used, last_block
    if block and block != last_block:
        _emit()
        _emit()
        _emit(" /* %s */" % block)
        if start % 16:
            # The block starts mid-row on a fresh line: pad to its column.
            _emit(' ' * (20 + (start % 16 * 6)), end='')
    num = 0
    assert start % 8 == 0
    assert (end + 1) % 8 == 0
    for u in range(start, end + 1):
        if u % 16 == 0:
            _emit()
            _emit(" /* %04X */" % u, end='')
        if u in data:
            num += 1
        d = data.get(u, defaults)
        sys.stdout.write("%6s," % d[0])

    total += end - start + 1
    used += num
    if block:
        last_block = block


def main(argv):
    """Run the generator; ``argv`` is the full argument vector.

    Exits with status 1 after printing usage when the argument count is
    wrong.  Raises Exception when the generated table is under 50% occupied.
    """
    if len(argv) != 5:
        sys.stderr.write("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt\n")
        sys.exit(1)

    headers, data = load_unicode_data(argv[1:])
    data = map_to_use(merge_properties(data))

    # Remove the outliers
    singles = {}
    for u in [0x034F, 0x25CC, 0x1107F]:
        singles[u] = data.pop(u)

    _emit("/* == Start of generated table == */")
    _emit("/*")
    _emit(" * The following table is generated by running:")
    _emit(" *")
    _emit(" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
    _emit(" *")
    _emit(" * on files with these headers:")
    _emit(" *")
    for h in headers:
        for l in h:
            _emit(" * %s" % (l.strip()))
    _emit(" */")
    _emit()
    _emit('#include "hb-ot-shape-complex-use-private.hh"')
    _emit()

    uu = sorted(data)

    last = -100000
    offset = 0
    starts = []
    ends = []
    for k in sorted(use_mapping):
        if k in use_positions and use_positions[k]: continue
        _emit("#define %s USE_%s /* %s */" % (k, k, use_mapping[k].__name__[3:]))
    for k in sorted(use_positions):
        v = use_positions[k]
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            _emit("#define %s USE_%s" % (tag, tag))
    _emit("")
    _emit("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
    for u in uu:
        if u <= last:
            continue
        block = data[u][1]

        # Round the run of same-block code points out to multiples of 8.
        start = u // 8 * 8
        end = start + 1
        # `data` has exactly the keys listed in `uu`; the dict lookup avoids
        # a linear scan of the list on every step.
        while end in data and block == data[end][1]:
            end += 1
        end = (end - 1) // 8 * 8 + 7

        if start != last + 1:
            if start - last <= 1 + 16 * 3:
                # Small gap: fill it with default entries.
                print_block(None, last + 1, start - 1, data)
                last = start - 1
            else:
                # Large gap: close the current range and start a new one.
                if last >= 0:
                    ends.append(last + 1)
                    offset += ends[-1] - starts[-1]
                _emit()
                _emit()
                _emit("#define use_offset_0x%04xu %d" % (start, offset))
                starts.append(start)

        print_block(block, start, end, data)
        last = end
    ends.append(last + 1)
    offset += ends[-1] - starts[-1]
    _emit()
    _emit()
    occupancy = used * 100. / total
    page_bits = 12
    _emit("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    _emit()
    _emit("USE_TABLE_ELEMENT_TYPE")
    _emit("hb_use_get_categories (hb_codepoint_t u)")
    _emit("{")
    _emit(" switch (u >> %d)" % page_bits)
    _emit(" {")
    pages = set(u >> page_bits for u in starts + ends + list(singles))
    for p in sorted(pages):
        _emit(" case 0x%0Xu:" % p)
        for (start, end) in zip(starts, ends):
            if p not in [start >> page_bits, end >> page_bits]: continue
            offset_name = "use_offset_0x%04xu" % start
            _emit(" if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end - 1, start, offset_name))
        for u, d in singles.items():
            if p != u >> page_bits: continue
            _emit(" if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0]))
        _emit(" break;")
        _emit("")
    _emit(" default:")
    _emit(" break;")
    _emit(" }")
    _emit(" return USE_O;")
    _emit("}")
    _emit()
    for k in sorted(use_mapping.keys()):
        if k in use_positions and use_positions[k]: continue
        _emit("#undef %s" % k)
    for k in sorted(use_positions):
        v = use_positions[k]
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            _emit("#undef %s" % tag)
    _emit()
    _emit("/* == End of generated table == */")

    # Maintain at least 50% occupancy in the table */
    if occupancy < 50:
        raise Exception("Table too sparse, please investigate: ", occupancy)


if __name__ == '__main__':
    main(sys.argv)