#!/usr/bin/env python3

"""usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
"""

import sys

if len (sys.argv) != 4:
    sys.exit (__doc__)

ALLOWED_SINGLES = [0x00A0, 0x25CC]
ALLOWED_BLOCKS = [
    'Basic Latin',
    'Latin-1 Supplement',
    'Devanagari',
    'Bengali',
    'Gurmukhi',
    'Gujarati',
    'Oriya',
    'Tamil',
    'Telugu',
    'Kannada',
    'Malayalam',
    'Sinhala',
    'Myanmar',
    'Khmer',
    'Vedic Extensions',
    'General Punctuation',
    'Superscripts and Subscripts',
    'Devanagari Extended',
    'Myanmar Extended-B',
    'Myanmar Extended-A',
]

files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

# The first two lines of each UCD file carry its version/date header.
headers = [[f.readline () for i in range (2)] for f in files]

# Parse each UCD file into a {codepoint: value} map, counting characters per value.
data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
    for line in f:

        j = line.find ('#')
        if j >= 0:
            line = line[:j]

        fields = [x.strip () for x in line.split (';')]
        if len (fields) == 1:
            continue

        uu = fields[0].split ('..')
        start = int (uu[0], 16)
        if len (uu) == 1:
            end = start
        else:
            end = int (uu[1], 16)

        t = fields[1]

        for u in range (start, end + 1):
            data[i][u] = t
        values[i][t] = values[i].get (t, 0) + end - start + 1

# Merge data into one dict:
defaults = ('Other', 'Not_Applicable', 'No_Block')
for i,v in enumerate (defaults):
    values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (data):
    for u,v in d.items ():
        if i == 2 and u not in combined:
            continue
        if u not in combined:
            combined[u] = list (defaults)
        combined[u][i] = v
combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
data = combined
del combined

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
singles = {}
for u in ALLOWED_SINGLES:
    singles[u] = data[u]
    del data[u]

print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
    for l in h:
        print (" * %s" % (l.strip()))
print (" */")
print ()
print ('#include "hb.hh"')
print ()
print ('#ifndef HB_NO_OT_SHAPE')
print ()
print ('#include "hb-ot-shape-complex-indic.hh"')
print ()

# Shorten values: hand-picked aliases; any value not listed here is
# abbreviated automatically below.
short = [{
    "Bindu": 'Bi',
    "Cantillation_Mark": 'Ca',
    "Joiner": 'ZWJ',
    "Non_Joiner": 'ZWNJ',
    "Number": 'Nd',
    "Visarga": 'Vs',
    "Vowel": 'Vo',
    "Vowel_Dependent": 'M',
    "Consonant_Prefixed": 'CPrf',
    "Other": 'x',
},{
    "Not_Applicable": 'x',
}]
all_shorts = [{},{}]

# Seed the reverse maps with the hand-picked aliases above, so the
# auto-generated abbreviations below cannot collide with them.
for i in range (2):
    for v,s in short[i].items ():
        all_shorts[i][s] = v

what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')

# Build one #define per category value, abbreviating any value without a
# hand-picked alias by its capital letters.
cat_defs = []
for i in range (2):
    vv = sorted (values[i].keys ())
    for v in vv:
        v_no_and = v.replace ('_And_', '_')
        if v in short[i]:
            s = short[i][v]
        else:
            s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
            if s in all_shorts[i]:
                raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
            all_shorts[i][s] = v
            short[i][v] = s
        cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + v.upper (), str (values[i][v]), v))

maxlen_s = max ([len (c[0]) for c in cat_defs])
maxlen_l = max ([len (c[1]) for c in cat_defs])
maxlen_n = max ([len (c[2]) for c in cat_defs])
for s in what_short:
    print ()
    for c in [c for c in cat_defs if s in c[0]]:
        print ("#define %s %s /* %s chars; %s */" %
               (c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3]))
print ()
print ('#pragma GCC diagnostic pop')
print ()
print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print ()
print ()

total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
    """Emit one run of table entries, eight codepoints per output row."""
    global total, used, last_block
    if block and block != last_block:
        print ()
        print ()
        print ("  /* %s */" % block)
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range (start, end+1):
        if u % 8 == 0:
            print ()
            print ("  /* %04X */" % u, end="")
        if u in data:
            num += 1
        d = data.get (u, defaults)
        print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")

    total += end - start + 1
    used += num
    if block:
        last_block = block

uu = sorted (data.keys ())

# Emit the table as contiguous ranges, each padded to a multiple of eight
# codepoints and introduced by an indic_offset_* macro; small gaps are filled
# with default entries instead of starting a new range.
last = -100000
num = 0
offset = 0
starts = []
ends = []
print ("static const uint16_t indic_table[] = {")
for u in uu:
    if u <= last:
        continue
    block = data[u][2]

    start = u//8*8
    end = start+1
    while end in uu and block == data[end][2]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            print_block (None, last+1, start-1, data)
        else:
            if last >= 0:
                ends.append (last + 1)
                offset += ends[-1] - starts[-1]
            print ()
            print ()
            print ("#define indic_offset_0x%04xu %d" % (start, offset))
            starts.append (start)

    print_block (block, start, end, data)
    last = end
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ()

# Emit the lookup function: one switch case per 4K page, with the singles
# handled before the range lookups.
print ("uint16_t")
print ("hb_indic_get_categories (hb_codepoint_t u)")
print ("{")
print ("  switch (u >> %d)" % page_bits)
print ("  {")
pages = set ([u>>page_bits for u in starts+ends+list (singles.keys ())])
for p in sorted(pages):
    print ("    case 0x%0Xu:" % p)
    for u,d in singles.items ():
        if p != u>>page_bits: continue
        print ("      if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
    for (start,end) in zip (starts, ends):
        if p not in [start>>page_bits, end>>page_bits]: continue
        offset = "indic_offset_0x%04xu" % start
        print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print ("      break;")
    print ("")
print ("    default:")
print ("      break;")
print ("  }")
print ("  return _(x,x);")
print ("}")
print ()

# Undefine the generated macros again.
print ("#undef _")
for i in range (2):
    print ()
    vv = sorted (values[i].keys ())
    for v in vv:
        print ("#undef %s_%s" %
               (what_short[i], short[i][v]))
print ()
print ('#endif')
print ()
print ("/* == End of generated table == */")

# Maintain at least 30% occupancy in the table
if occupancy < 30:
    raise Exception ("Table too sparse, please investigate: ", occupancy)