#!/usr/bin/python
2
3import sys
4
# The script needs exactly four data-file arguments (argv[0] is the script).
if len (sys.argv) != 5:
	sys.stderr.write ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt\n")
	sys.exit (1)
8
# Characters whose block name is listed here are dropped when the per-file
# data is merged into the combined table.
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

# One handle per command-line file, in argument order.  open() is used
# instead of the Python-2-only file() constructor; the two behave the same
# here, and open() is the documented way to open a file.
files = [open (x) for x in sys.argv[1:]]

# Consume the first two lines of every file except the third one
# (UnicodeData.txt, per the usage string) so they can be echoed in the
# generated output; the third file gets a placeholder entry appended last.
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
headers.append (["UnicodeData.txt does not have a header."])
15
# data[i]   maps code point -> property value read from input file i.
# values[i] maps property value -> number of code points carrying it.
data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate (files):
	# The third file (index 2) takes its value from the third field of each
	# record; every other file takes it from the second field.
	value_field = 2 if i == 2 else 1

	for line in f:

		# Strip a trailing '#' comment, if any.
		line = line.split ('#', 1)[0]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			# No ';' on the line: blank or comment-only, nothing to record.
			continue

		# First field is either a single hex code point or "START..END".
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		t = fields[value_field]

		for u in range (start, end + 1):
			data[i][u] = t
		values[i][t] = values[i].get (t, 0) + end - start + 1

	# The file has been read to the end and is not used again; release the
	# handle instead of leaving it open for the rest of the run.
	f.close ()
41
# Fallback value for each input file, in file order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
data[0][0x034F] = defaults[0]
data[0][0x2060] = defaults[0]
u = 0xFE00
while u <= 0xFE0F:
	data[0][u] = defaults[0]
	u += 1

# Count each default value once as well.
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1

# Merge data into one dict: code point -> [value from file 0, 1, 2, 3].
combined = {}
for i,d in enumerate (data):
	for u,v in d.items ():
		if u not in combined:
			# Only the first two files may introduce a code point; the
			# later files merely fill in columns of existing entries.
			if i >= 2:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v

# Drop every code point whose block (fourth column) is blacklisted.
data = dict ((k, v) for k,v in combined.items () if v[3] not in BLACKLISTED_BLOCKS)
del combined
num = len (data)
65
66
# Every property value name the script refers to by identifier, grouped by
# the property it belongs to.  The concatenation yields one flat list.
property_names = (
	# General_Category
	[
		'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
		'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
		'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	] +
	# Indic_Syllabic_Category
	[
		'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
		'Pure_Killer', 'Invisible_Stacker',
		'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
		'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
		'Consonant_With_Stacker', 'Consonant_Prefixed',
		'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
		'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
		'Consonant_Head_Letter',
		'Modifying_Letter', 'Tone_Letter', 'Tone_Mark',
		'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
		'Syllable_Modifier', 'Consonant_Killer',
		'Non_Joiner', 'Joiner', 'Number_Joiner', 'Number',
		'Brahmi_Joining_Number',
	] +
	# Indic_Positional_Category
	[
		'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left',
		'Left_And_Right', 'Top', 'Bottom', 'Top_And_Bottom',
		'Top_And_Right', 'Top_And_Left', 'Top_And_Left_And_Right',
		'Bottom_And_Right', 'Top_And_Bottom_And_Right', 'Overstruck',
	]
)
124
class PropertyValue(object):
	"""A named property value that compares equal to its own name.

	Instances compare equal to another PropertyValue with the same name
	and to a plain string equal to that name, so values read from the data
	files as strings can be tested directly against these objects.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __repr__(self):
		return "PropertyValue(%r)" % (self.name,)
	def __eq__(self, other):
		# A PropertyValue contributes its .name; anything else (normally a
		# string) is compared as-is.  getattr avoids the Python-2-only
		# `basestring` check and cannot raise on unrelated objects.
		return self.name == getattr(other, 'name', other)
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		# Must agree with __eq__: equal objects need equal hashes, otherwise
		# dict/set membership tests against names silently miss.
		return hash(self.name)
134
# Name -> PropertyValue for every entry of property_names.
property_values = {}

for name in property_names:
	# Both dicts are keyed by *name*, so the duplicate/clash checks have to
	# test the name.  Testing the PropertyValue object itself only works if
	# the object hashes like its name, which is not guaranteed.
	assert name not in property_values
	assert name not in globals()
	property_values[name] = PropertyValue(name)

# Expose every property value as a module-level identifier (Lo, Virama, ...)
# so the code below can refer to them by bare name.
globals().update(property_values)
143
144
def is_BASE(U, UISC, UGC):
	"""Predicate for the 'B' entry of use_mapping.

	U is the code point, UISC its Indic_Syllabic_Category and UGC its
	General_Category, as passed by map_to_use.
	"""
	if UISC in (Number, Consonant, Consonant_Head_Letter,
			#SPEC-DRAFT Consonant_Placeholder,
			Tone_Letter,
			Vowel_Independent #SPEC-DRAFT
			):
		return True
	# Some categories count as a base only for General_Category Lo.
	return (UGC == Lo and
		UISC in (Avagraha, Bindu, Consonant_Final, Consonant_Medial,
			Consonant_Subjoined, Vowel, Vowel_Dependent))
def is_BASE_IND(U, UISC, UGC):
	"""Predicate for the 'IND' entry of use_mapping."""
	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
	if UISC in (Consonant_Dead, Modifying_Letter):
		return True
	# Po punctuation qualifies, except for two excluded code points.
	# SPEC-DRAFT-OUTDATED! U == 0x002D is no longer part of this rule.
	return UGC == Po and U not in (0x104E, 0x2022)
def is_BASE_NUM(U, UISC, UGC):
	"""Predicate for the 'N' entry of use_mapping."""
	return Brahmi_Joining_Number == UISC
def is_BASE_OTHER(U, UISC, UGC):
	"""Predicate for the 'GB' entry of use_mapping."""
	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
	return (UISC == Consonant_Placeholder #SPEC-DRAFT
		or U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UGC):
	"""Predicate for the 'CGJ' entry of use_mapping: true only for U+034F."""
	return 0x034F == U
def is_CONS_FINAL(U, UISC, UGC):
	"""Predicate for the 'F' entry of use_mapping."""
	if UISC == Consonant_Final and UGC != Lo:
		return True
	return UISC == Consonant_Succeeding_Repha
def is_CONS_FINAL_MOD(U, UISC, UGC):
	"""Predicate for the 'FM' entry of use_mapping."""
	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
	return Syllable_Modifier == UISC
def is_CONS_MED(U, UISC, UGC):
	"""Predicate for the 'M' entry of use_mapping."""
	if UISC == Consonant_Medial:
		return UGC != Lo
	return False
def is_CONS_MOD(U, UISC, UGC):
	"""Predicate for the 'CM' entry of use_mapping."""
	return UISC in (Nukta, Gemination_Mark, Consonant_Killer)
def is_CONS_SUB(U, UISC, UGC):
	"""Predicate for the 'SUB' entry of use_mapping."""
	#SPEC-DRAFT return UISC == Consonant_Subjoined
	if UISC == Consonant_Subjoined:
		return UGC != Lo
	return False
def is_HALANT(U, UISC, UGC):
	"""Predicate for the 'H' entry of use_mapping."""
	return UISC in (Virama, Invisible_Stacker)
def is_HALANT_NUM(U, UISC, UGC):
	"""Predicate for the 'HN' entry of use_mapping."""
	return Number_Joiner == UISC
def is_ZWNJ(U, UISC, UGC):
	"""Predicate for the 'ZWNJ' entry of use_mapping."""
	return Non_Joiner == UISC
def is_ZWJ(U, UISC, UGC):
	"""Predicate for the 'ZWJ' entry of use_mapping."""
	return Joiner == UISC
def is_Word_Joiner(U, UISC, UGC):
	"""Predicate for the 'WJ' entry of use_mapping: true only for U+2060."""
	return 0x2060 == U
def is_OTHER(U, UISC, UGC):
	"""Predicate for the 'O' entry of use_mapping.

	True for syllabic category Other, unless one of the more specific
	code-point based predicates already claims the character.
	"""
	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
	if not (UISC == Other):
		return False
	if is_SYM_MOD(U, UISC, UGC):
		return False
	if is_CGJ(U, UISC, UGC):
		return False
	if is_Word_Joiner(U, UISC, UGC):
		return False
	return not is_VARIATION_SELECTOR(U, UISC, UGC)
def is_Reserved(U, UISC, UGC):
	"""Predicate for the 'Rsv' entry of use_mapping: General_Category is Cn."""
	return 'Cn' == UGC
def is_REPHA(U, UISC, UGC):
	"""Predicate for the 'R' entry of use_mapping."""
	#return UISC == Consonant_Preceding_Repha
	#SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
	return UISC in (Consonant_Preceding_Repha,
			Consonant_With_Stacker,
			Consonant_Prefixed)
def is_SYM(U, UISC, UGC):
	"""Predicate for the 'S' entry of use_mapping."""
	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
	# U+25CC is explicitly excluded. #SPEC-DRAFT
	return U != 0x25CC and UGC in (So, Sc)
def is_SYM_MOD(U, UISC, UGC):
	"""Predicate for the 'SM' entry of use_mapping: U+1B6B..U+1B73."""
	# The accepted code points form one contiguous run.
	return 0x1B6B <= U <= 0x1B73
def is_VARIATION_SELECTOR(U, UISC, UGC):
	"""Predicate for the 'VS' entry of use_mapping: U+FE00..U+FE0F."""
	return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
	"""Predicate for the 'V' entry of use_mapping."""
	if UISC == Pure_Killer:
		return True
	return UGC != Lo and UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UGC):
	"""Predicate for the 'VM' entry of use_mapping."""
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	return UGC != Lo and UISC == Bindu
218
# USE category tag -> predicate deciding whether a character belongs to it.
# map_to_use requires that exactly one predicate accepts each character.
use_mapping = {
	# Bases
	'B': is_BASE,
	'IND': is_BASE_IND,
	'N': is_BASE_NUM,
	'GB': is_BASE_OTHER,
	'CGJ': is_CGJ,
	# Consonant-related marks
	'F': is_CONS_FINAL,
	'FM': is_CONS_FINAL_MOD,
	'M': is_CONS_MED,
	'CM': is_CONS_MOD,
	'SUB': is_CONS_SUB,
	# Halants and joiners
	'H': is_HALANT,
	'HN': is_HALANT_NUM,
	'ZWNJ': is_ZWNJ,
	'ZWJ': is_ZWJ,
	'WJ': is_Word_Joiner,
	# Everything else
	'O': is_OTHER,
	'Rsv': is_Reserved,
	'R': is_REPHA,
	'S': is_SYM,
	'SM': is_SYM_MOD,
	'VS': is_VARIATION_SELECTOR,
	'V': is_VOWEL,
	'VM': is_VOWEL_MOD,
}
244
# USE category tag -> {position suffix -> Indic_Positional_Category values
# that select that suffix}.  A tag mapped to None may carry a positional
# category but is emitted without any suffix.
use_positions = {
	'F': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Right]},
	'M': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Right], 'Pre': [Left]},
	'CM': {'Abv': [Top], 'Blw': [Bottom]},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {'Abv': [Top], 'Blw': [Bottom, Overstruck], 'Pst': [Right], 'Pre': [Left]},
	'SM': {'Abv': [Top], 'Blw': [Bottom]},
	'H': None,
	'B': None,
	'FM': None,
	'SUB': None,
}
282
def map_to_use(data):
	"""Classify every code point of `data` into a USE category.

	`data` maps a code point to a 4-item sequence (UISC, UIPC, UGC, UBlock).
	Returns a dict mapping each code point to (USE tag, UBlock), where the
	tag carries a position suffix when use_positions defines one for it.
	Asserts that exactly one use_mapping predicate accepts each character
	and, when suffixes apply, that exactly one suffix matches.
	"""
	out = {}
	predicates = use_mapping.items()
	for U,(UISC,UIPC,UGC,UBlock) in data.items():

		# Resolve Indic_Syllabic_Category

		if U == 0x17DD:
			# TODO: These don't have UISC assigned in Unicode 8.0, but
			# have UIPC
			UISC = Vowel_Dependent
		elif 0x1CE2 <= U <= 0x1CE8:
			UISC = Cantillation_Mark
		elif U == 0x1CED:
			# TODO: U+1CED should only be allowed after some of
			# the nasalization marks, maybe only for U+1CE9..U+1CF1.
			UISC = Tone_Mark

		matches = [tag for tag,accepts in predicates if accepts(U,UISC,UGC)]
		assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
		USE = matches[0]

		# Resolve Indic_Positional_Category

		if U == 0x1B6C:
			# TODO: Not in Unicode 8.0 yet, but in spec.
			UIPC = Bottom
		elif U in [0x953, 0x954]:
			# TODO: These should die, but have UIPC in Unicode 8.0
			UIPC = Not_Applicable
		elif U == 0x103C:
			# TODO: In USE's override list but not in Unicode 8.0
			UIPC = Left
		# TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
		elif 0xA926 <= U <= 0xA92A:
			UIPC = Top
		elif U == 0x111CA:
			UIPC = Bottom
		elif U == 0x11300:
			UIPC = Top
		elif U == 0x1133C:
			UIPC = Bottom
		elif U == 0x1171E:
			UIPC = Left # Correct?!
		elif 0x1CF2 <= U <= 0x1CF3:
			UIPC = Right
		elif 0x1CF8 <= U <= 0x1CF9:
			UIPC = Top

		# A real position is only acceptable on categories listed in
		# use_positions.
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		suffix_table = use_positions.get(USE, None)
		if suffix_table:
			suffixes = [suf for suf,cats in suffix_table.items() if cats and UIPC in cats]
			assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
			USE = USE + suffixes[0]

		out[U] = (USE, UBlock)
	return out
335
# Entry used by print_block for code points that are absent from the table.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Remove the outliers: they are kept out of the table and emitted as
# explicit special cases in the generated lookup function.
singles = {}
for outlier in (0x034F, 0x25CC, 0x1107F):
	singles[outlier] = data.pop (outlier)
344
345print "/* == Start of generated table == */"
346print "/*"
347print " * The following table is generated by running:"
348print " *"
349print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
350print " *"
351print " * on files with these headers:"
352print " *"
353for h in headers:
354	for l in h:
355		print " * %s" % (l.strip())
356print " */"
357print
358print '#include "hb-ot-shape-complex-use-private.hh"'
359print
360
361total = 0
362used = 0
363last_block = None
364def print_block (block, start, end, data):
365	global total, used, last_block
366	if block and block != last_block:
367		print
368		print
369		print "  /* %s */" % block
370		if start % 16:
371			print ' ' * (20 + (start % 16 * 6)),
372	num = 0
373	assert start % 8 == 0
374	assert (end+1) % 8 == 0
375	for u in range (start, end+1):
376		if u % 16 == 0:
377			print
378			print "  /* %04X */" % u,
379		if u in data:
380			num += 1
381		d = data.get (u, defaults)
382		sys.stdout.write ("%6s," % d[0])
383
384	total += end - start + 1
385	used += num
386	if block:
387		last_block = block
388
389uu = data.keys ()
390uu.sort ()
391
392last = -100000
393num = 0
394offset = 0
395starts = []
396ends = []
397for k,v in sorted(use_mapping.items()):
398	if k in use_positions and use_positions[k]: continue
399	print "#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:])
400for k,v in sorted(use_positions.items()):
401	if not v: continue
402	for suf in v.keys():
403		tag = k + suf
404		print "#define %s	USE_%s" % (tag, tag)
405print ""
406print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
407for u in uu:
408	if u <= last:
409		continue
410	block = data[u][1]
411
412	start = u//8*8
413	end = start+1
414	while end in uu and block == data[end][1]:
415		end += 1
416	end = (end-1)//8*8 + 7
417
418	if start != last + 1:
419		if start - last <= 1+16*3:
420			print_block (None, last+1, start-1, data)
421			last = start-1
422		else:
423			if last >= 0:
424				ends.append (last + 1)
425				offset += ends[-1] - starts[-1]
426			print
427			print
428			print "#define use_offset_0x%04xu %d" % (start, offset)
429			starts.append (start)
430
431	print_block (block, start, end, data)
432	last = end
433ends.append (last + 1)
434offset += ends[-1] - starts[-1]
435print
436print
437occupancy = used * 100. / total
438page_bits = 12
439print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
440print
441print "USE_TABLE_ELEMENT_TYPE"
442print "hb_use_get_categories (hb_codepoint_t u)"
443print "{"
444print "  switch (u >> %d)" % page_bits
445print "  {"
446pages = set([u>>page_bits for u in starts+ends+singles.keys()])
447for p in sorted(pages):
448	print "    case 0x%0Xu:" % p
449	for (start,end) in zip (starts, ends):
450		if p not in [start>>page_bits, end>>page_bits]: continue
451		offset = "use_offset_0x%04xu" % start
452		print "      if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
453	for u,d in singles.items ():
454		if p != u>>page_bits: continue
455		print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
456	print "      break;"
457	print ""
458print "    default:"
459print "      break;"
460print "  }"
461print "  return USE_O;"
462print "}"
463print
464for k in sorted(use_mapping.keys()):
465	if k in use_positions and use_positions[k]: continue
466	print "#undef %s" % k
467for k,v in sorted(use_positions.items()):
468	if not v: continue
469	for suf in v.keys():
470		tag = k + suf
471		print "#undef %s" % tag
472print
473print "/* == End of generated table == */"
474
475# Maintain at least 50% occupancy in the table */
476if occupancy < 50:
477	raise Exception ("Table too sparse, please investigate: ", occupancy)
478