#!/usr/bin/env python

"""Generate a C lookup table of Indic syllabic and positional categories.

Usage:

    ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt

Each input file is read as UTF-8.  Its first two lines are echoed into the
header comment of the generated code; the remaining lines are parsed as
``START[..END] ; VALUE`` records in which ``#`` starts a comment.  The
generated C source is written to standard output.
"""

from __future__ import print_function, division, absolute_import

import io
import sys

USAGE = "usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt"

# Code points that are kept regardless of the block they belong to.  They are
# not stored in the table; the generated function tests for them explicitly.
ALLOWED_SINGLES = [0x00A0, 0x25CC]

# Only code points whose block (third input file) is listed here are kept.
ALLOWED_BLOCKS = [
    'Basic Latin',
    'Latin-1 Supplement',
    'Devanagari',
    'Bengali',
    'Gurmukhi',
    'Gujarati',
    'Oriya',
    'Tamil',
    'Telugu',
    'Kannada',
    'Malayalam',
    'Sinhala',
    'Myanmar',
    'Khmer',
    'Vedic Extensions',
    'General Punctuation',
    'Superscripts and Subscripts',
    'Devanagari Extended',
    'Myanmar Extended-B',
    'Myanmar Extended-A',
]

# Per-file value used for a code point that a file does not mention, in
# command-line order: syllabic category, positional category, block.
DEFAULTS = ('Other', 'Not_Applicable', 'No_Block')

# Code points whose syllabic category is forced to Vowel_Dependent when the
# data files leave it at the 'Other' default.
VOWEL_DEPENDENT_FALLBACKS = [0x17CD, 0x17CE, 0x17CF, 0x17D0, 0x17D3]

# Hand-picked short aliases (syllabic, positional), chosen to make the
# generated table more readable and to avoid duplicate aliases.  Values not
# listed here get an alias derived from their capital letters.
SHORT_NAMES = [{
    "Bindu": 'Bi',
    "Cantillation_Mark": 'Ca',
    "Joiner": 'ZWJ',
    "Non_Joiner": 'ZWNJ',
    "Number": 'Nd',
    "Visarga": 'Vs',
    "Vowel": 'Vo',
    "Vowel_Dependent": 'M',
    "Consonant_Prefixed": 'CPrf',
    "Other": 'x',
},{
    "Not_Applicable": 'x',
}]

# Prefixes of the long and short C macro names (syllabic, positional).
MACRO_PREFIXES = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
MACRO_SHORT_PREFIXES = ["ISC", "IMC"]

# The generated lookup function switches on ``u >> PAGE_BITS``.
PAGE_BITS = 12

# The script fails if fewer than this percentage of table slots are in use.
MIN_OCCUPANCY = 30


def parse_ucd_lines (lines):
    """Parse ``START[..END] ; VALUE`` records.

    Text after ``#`` is ignored, as are lines without a ``;``.  START and END
    are hexadecimal and inclusive.

    Returns a ``(data, counts)`` pair: ``data`` maps every code point in a
    record's range to the record's value, ``counts`` maps every value to the
    total size of the ranges that carry it.
    """
    data = {}
    counts = {}
    for line in lines:
        line = line.split ('#', 1)[0]
        fields = [x.strip () for x in line.split (';')]
        if len (fields) == 1:
            continue

        bounds = fields[0].split ('..')
        start = int (bounds[0], 16)
        end = start if len (bounds) == 1 else int (bounds[1], 16)

        value = fields[1]
        for u in range (start, end + 1):
            data[u] = value
        counts[value] = counts.get (value, 0) + end - start + 1
    return data, counts


def read_ucd_file (path):
    """Read one data file.

    Returns ``(header, data, counts)`` where ``header`` holds the first two
    lines of the file and ``data``/``counts`` come from parse_ucd_lines()
    applied to the rest.  The file is closed before returning.
    """
    with io.open (path, encoding='utf-8') as f:
        header = [f.readline () for _ in range (2)]
        data, counts = parse_ucd_lines (f)
    return header, data, counts


def merge_properties (data):
    """Merge the per-file dicts into ``{code point: [syllabic, positional, block]}``.

    A code point is included only if the first or the second file mentions
    it; the third file merely supplies its block.  Missing values are taken
    from DEFAULTS.  Entries that are neither in ALLOWED_SINGLES nor in one of
    the ALLOWED_BLOCKS are dropped.
    """
    combined = {}
    for i, d in enumerate (data):
        for u, v in d.items ():
            if u not in combined:
                if i == 2:
                    continue
                combined[u] = list (DEFAULTS)
            combined[u][i] = v
    return {k: v for k, v in combined.items ()
            if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}


def print_header (headers):
    """Print the opening comment, which echoes the input file headers, and the #include."""
    print ("/* == Start of generated table == */")
    print ("/*")
    print (" * The following table is generated by running:")
    print (" *")
    print (" * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
    print (" *")
    print (" * on files with these headers:")
    print (" *")
    for h in headers:
        for l in h:
            print (" * %s" % (l.strip()))
    print (" */")
    print ()
    print ('#include "hb-ot-shape-complex-indic.hh"')
    print ()


def print_defines (values, short):
    """Print one short-alias #define per category value, then the ``_`` macro.

    ``short`` is updated in place: every value of ``values[0]`` and
    ``values[1]`` that has no alias yet gets one built from its capital
    letters (ignoring the ``And`` of ``_And_``).

    Raises Exception if a derived alias is already taken by another value.
    """
    # Reverse maps, alias -> value, used to detect clashes.
    all_shorts = [{},{}]
    for i in range (2):
        for v, s in short[i].items ():
            all_shorts[i][s] = v

    for i in range (2):
        print ()
        for v in sorted (values[i]):
            v_no_and = v.replace ('_And_', '_')
            if v in short[i]:
                s = short[i][v]
            else:
                s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
                if s in all_shorts[i]:
                    raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
                all_shorts[i][s] = v
                short[i][v] = s
            print ("#define %s_%s %s_%s %s/* %3d chars; %s */" %
                (MACRO_SHORT_PREFIXES[i], s, MACRO_PREFIXES[i], v.upper (),
                ' '* ((48-1 - len (MACRO_PREFIXES[i]) - 1 - len (v)) // 8),
                values[i][v], v))
    print ()
    print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
    print ()
    print ()


def print_block (block, start, end, data, short, last_block):
    """Print the table entries for code points ``start``..``end`` (inclusive).

    Eight entries are printed per row, so ``start`` and ``end + 1`` must be
    multiples of 8.  A block-name comment is printed first when ``block`` is
    set and differs from ``last_block``.  Code points missing from ``data``
    are printed with the DEFAULTS categories.

    Returns the number of code points in the range that are present in
    ``data``.
    """
    if block and block != last_block:
        print ()
        print ()
        print (" /* %s */" % block)
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    present = 0
    for u in range (start, end+1):
        if u % 8 == 0:
            print ()
            print (" /* %04X */" % u, end="")
        if u in data:
            present += 1
        d = data.get (u, DEFAULTS)
        print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")
    return present


def print_table (data, short):
    """Print the ``indic_table`` array definition.

    Code points are emitted in runs aligned to multiples of 8.  A gap of at
    most 48 code points between two runs is filled with default entries; a
    larger gap starts a new sub-range, whose position inside the array is
    published as an ``indic_offset_0x....u`` macro.

    Returns ``(starts, ends, occupancy)``: the first code point and the
    one-past-last code point of every sub-range, and the percentage of
    emitted slots that correspond to code points present in ``data``.
    """
    total = 0
    used = 0
    last_block = None
    last = -100000
    offset = 0
    starts = []
    ends = []
    print ("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
    for u in sorted (data):
        if u <= last:
            continue
        block = data[u][2]

        # Extend the run while consecutive code points stay in the same
        # block, then round it out to a whole row of 8.
        start = u//8*8
        end = start+1
        while end in data and block == data[end][2]:
            end += 1
        end = (end-1)//8*8 + 7

        if start != last + 1:
            if start - last <= 1+16*3:
                # Small gap: pad it rather than starting a new sub-range.
                used += print_block (None, last+1, start-1, data, short, last_block)
                total += start - last - 1
                last = start-1
            else:
                if last >= 0:
                    ends.append (last + 1)
                    offset += ends[-1] - starts[-1]
                print ()
                print ()
                print ("#define indic_offset_0x%04xu %d" % (start, offset))
                starts.append (start)

        used += print_block (block, start, end, data, short, last_block)
        total += end - start + 1
        if block:
            last_block = block
        last = end
    ends.append (last + 1)
    offset += ends[-1] - starts[-1]
    print ()
    print ()
    occupancy = used * 100. / total
    print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    print ()
    return starts, ends, occupancy


def print_lookup (starts, ends, singles, short):
    """Print the ``hb_indic_get_categories`` function.

    The function switches on ``u >> PAGE_BITS``.  Each case first tests the
    ``singles`` on that page and then the sub-ranges (``starts``/``ends`` as
    returned by print_table()) that begin or end on it.
    """
    print ("INDIC_TABLE_ELEMENT_TYPE")
    print ("hb_indic_get_categories (hb_codepoint_t u)")
    print ("{")
    print (" switch (u >> %d)" % PAGE_BITS)
    print (" {")
    pages = set ([u>>PAGE_BITS for u in starts+ends+list (singles.keys ())])
    for p in sorted(pages):
        print (" case 0x%0Xu:" % p)
        for u, d in singles.items ():
            if p != u>>PAGE_BITS: continue
            print (" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
        for (start, end) in zip (starts, ends):
            if p not in [start>>PAGE_BITS, end>>PAGE_BITS]: continue
            macro = "indic_offset_0x%04xu" % start
            print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, macro))
        print (" break;")
        print ("")
    print (" default:")
    print (" break;")
    print (" }")
    print (" return _(x,x);")
    print ("}")
    print ()


def print_undefs (values, short):
    """Print the #undef lines for every macro defined by print_defines(), and the end marker."""
    print ("#undef _")
    for i in range (2):
        # Blank line before each group.  This must be a call: with
        # print_function in effect a bare ``print`` does nothing.
        print ()
        for v in sorted (values[i]):
            print ("#undef %s_%s" %
                (MACRO_SHORT_PREFIXES[i], short[i][v]))
    print ()
    print ("/* == End of generated table == */")


def main (argv):
    """Run the generator.

    ``argv`` is the full argument vector: program name followed by the paths
    of the syllabic-category, positional-category and blocks files.

    Returns the process exit status: 1 after printing the usage message when
    the argument count is wrong, 0 on success.

    Raises Exception if a short alias clashes or if, after all output has
    been written, the table occupancy is below MIN_OCCUPANCY percent.
    """
    if len (argv) != 4:
        print (USAGE, file=sys.stderr)
        return 1

    headers = []
    data = []
    values = []
    for path in argv[1:]:
        header, d, counts = read_ucd_file (path)
        headers.append (header)
        data.append (d)
        values.append (counts)

    # Count every default once, so that it is listed among the values (and
    # gets its macro) even when no record names it explicitly.
    for i, v in enumerate (DEFAULTS):
        values[i][v] = values[i].get (v, 0) + 1

    data = merge_properties (data)

    for u in VOWEL_DEPENDENT_FALLBACKS:
        if data[u][0] == 'Other':
            data[u][0] = "Vowel_Dependent"

    # Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
    singles = {}
    for u in ALLOWED_SINGLES:
        singles[u] = data.pop (u)

    # Work on a copy: print_defines() adds the derived aliases to it.
    short = [dict (d) for d in SHORT_NAMES]

    print_header (headers)
    print_defines (values, short)
    starts, ends, occupancy = print_table (data, short)
    print_lookup (starts, ends, singles, short)
    print_undefs (values, short)

    # Maintain at least 30% occupancy in the table
    if occupancy < MIN_OCCUPANCY:
        raise Exception ("Table too sparse, please investigate: ", occupancy)
    return 0


if __name__ == '__main__':
    sys.exit (main (sys.argv))