• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2
3import sys
4
# The script needs exactly four data-file arguments; otherwise print the
# usage line on stderr and exit with status 1.
if len (sys.argv) != 5:
	sys.stderr.write ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt" + "\n")
	sys.exit (1)
8
# Blocks whose code points are dropped from the merged data later in the
# script.
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

# open() rather than the Python-2-only file() alias; the handles are closed
# in the finally clause below once everything has been read.
files = [open (x) for x in sys.argv[1:]]

try:
	# Keep the first two lines of every file except the third one
	# (UnicodeData.txt), which gets a fixed placeholder instead.
	headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
	headers.append (["UnicodeData.txt does not have a header."])

	# data[i]   : code point -> property value read from file i
	# values[i] : property value -> number of code points carrying it
	data = [{} for f in files]
	values = [{} for f in files]
	for i, f in enumerate (files):
		for line in f:

			# Strip trailing '#' comments.
			j = line.find ('#')
			if j >= 0:
				line = line[:j]

			fields = [x.strip () for x in line.split (';')]
			if len (fields) == 1:
				# Blank or comment-only line.
				continue

			# First field is either "XXXX" or "XXXX..YYYY" (hex).
			uu = fields[0].split ('..')
			start = int (uu[0], 16)
			if len (uu) == 1:
				end = start
			else:
				end = int (uu[1], 16)

			# The value is field 1, except for the third file where it
			# is field 2.
			t = fields[1 if i != 2 else 2]

			for u in range (start, end + 1):
				data[i][u] = t
			values[i][t] = values[i].get (t, 0) + end - start + 1
finally:
	for f in files:
		f.close ()
41
# Fallback value per input file, in command-line order: syllabic category,
# positional category, general category, block.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in [0x034F, 0x2060, 0x20F0] + list (range (0xFE00, 0xFE0F + 1)):
	data[0][u] = defaults[0]

# Count every default value once in the per-file tallies.
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1

# Merge data into one dict: code point -> [value from each file].
# Only the first two files may introduce a code point; the remaining
# files merely fill in their column for code points already present.
combined = {}
for i,d in enumerate (data):
	for u,v in d.items ():
		if u not in combined:
			if i >= 2:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v

# Drop everything that lives in a blacklisted block (column 3).
data = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
del combined
num = len (data)
66
67
# Every property value name that gets a module-level constant, grouped by
# the property it belongs to.
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
	'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
	'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
]
property_names += [
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
]
property_names += [
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
]
126
class PropertyValue(object):
	"""A named property value that compares equal to its own name.

	Instances compare equal both to other PropertyValue objects with the
	same name and to the plain name string, and they hash like that string
	so equality and hashing agree (set/dict lookups by either form work).
	Comparing against anything else is simply unequal.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __eq__(self, other):
		# Unwrap another PropertyValue; every other operand (normally a
		# string) is compared against the name directly.
		if isinstance(other, PropertyValue):
			other = other.name
		return self.name == other
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		# Must match hash(name) because instances compare equal to it.
		return hash(self.name)
136
# Build one PropertyValue per name and publish each as a module-level
# constant (e.g. Lo, Virama, Top) for the predicates below to use.
property_values = {}

for name in property_names:
	value = PropertyValue(name)
	# Both dicts are keyed by name strings, so the checks must look up the
	# name itself to catch duplicates and clashes with existing globals.
	assert name not in property_values, "duplicate property name: %s" % name
	assert name not in globals(), "property name shadows a global: %s" % name
	property_values[name] = value
globals().update(property_values)
145
146
def is_BASE(U, UISC, UGC):
	"""Predicate for USE category 'B'."""
	if UISC in (Number, Consonant, Consonant_Head_Letter,
			#SPEC-DRAFT Consonant_Placeholder,
			Tone_Letter,
			Vowel_Independent #SPEC-DRAFT
			):
		return True
	# Characters of general category Lo in these syllabic categories
	# are classified as bases too.
	return (UGC == Lo and
		UISC in (Avagraha, Bindu, Consonant_Final, Consonant_Medial,
			Consonant_Subjoined, Vowel, Vowel_Dependent))
def is_BASE_IND(U, UISC, UGC):
	"""Predicate for USE category 'IND'."""
	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
	if UISC in (Consonant_Dead, Modifying_Letter):
		return True
	# SPEC-DRAFT-OUTDATED! U == 0x002D is not special-cased here.
	return UGC == Po and U not in (0x104E, 0x2022, 0x11A3F, 0x11A45)
def is_BASE_NUM(U, UISC, UGC):
	"""Predicate for USE category 'N': Brahmi joining numbers."""
	return Brahmi_Joining_Number == UISC
def is_BASE_OTHER(U, UISC, UGC):
	"""Predicate for USE category 'GB'."""
	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
	return (UISC == Consonant_Placeholder #SPEC-DRAFT
		or U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UGC):
	"""Predicate for USE category 'CGJ': matches only U+034F."""
	return 0x034F == U
def is_CONS_FINAL(U, UISC, UGC):
	"""Predicate for USE category 'F'."""
	# Final consonants only count when they are not letters (Lo).
	if UISC == Consonant_Final and UGC != Lo:
		return True
	return UISC == Consonant_Succeeding_Repha
def is_CONS_FINAL_MOD(U, UISC, UGC):
	"""Predicate for USE category 'FM': syllable modifiers."""
	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
	return Syllable_Modifier == UISC
def is_CONS_MED(U, UISC, UGC):
	"""Predicate for USE category 'M': medial consonants that are not Lo."""
	if UISC == Consonant_Medial:
		return UGC != Lo
	return False
def is_CONS_MOD(U, UISC, UGC):
	"""Predicate for USE category 'CM'."""
	modifier_categories = (Nukta, Gemination_Mark, Consonant_Killer)
	return UISC in modifier_categories
def is_CONS_SUB(U, UISC, UGC):
	"""Predicate for USE category 'SUB': subjoined consonants that are not Lo."""
	#SPEC-DRAFT return UISC == Consonant_Subjoined
	if UISC == Consonant_Subjoined:
		return UGC != Lo
	return False
def is_CONS_WITH_STACKER(U, UISC, UGC):
	"""Predicate for USE category 'CS'."""
	return Consonant_With_Stacker == UISC
def is_HALANT(U, UISC, UGC):
	"""Predicate for USE category 'H': viramas and invisible stackers."""
	halant_categories = (Virama, Invisible_Stacker)
	return UISC in halant_categories
def is_HALANT_NUM(U, UISC, UGC):
	"""Predicate for USE category 'HN': number joiners."""
	return Number_Joiner == UISC
def is_ZWNJ(U, UISC, UGC):
	"""Predicate for USE category 'ZWNJ': the Non_Joiner syllabic category."""
	return Non_Joiner == UISC
def is_ZWJ(U, UISC, UGC):
	"""Predicate for USE category 'ZWJ': the Joiner syllabic category."""
	return Joiner == UISC
def is_Word_Joiner(U, UISC, UGC):
	"""Predicate for USE category 'WJ': matches only U+2060."""
	return 0x2060 == U
def is_OTHER(U, UISC, UGC):
	"""Predicate for USE category 'O'.

	True for syllabic category Other, unless one of the code-point based
	predicates (SM, CGJ, WJ, VS) already claims the character.
	"""
	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
	if UISC != Other:
		return False
	claimed_elsewhere = (is_SYM_MOD, is_CGJ, is_Word_Joiner, is_VARIATION_SELECTOR)
	return not any(claims(U, UISC, UGC) for claims in claimed_elsewhere)
def is_Reserved(U, UISC, UGC):
	"""Predicate for USE category 'Rsv': general category Cn."""
	return 'Cn' == UGC
def is_REPHA(U, UISC, UGC):
	"""Predicate for USE category 'R'."""
	repha_categories = (Consonant_Preceding_Repha, Consonant_Prefixed)
	return UISC in repha_categories
def is_SYM(U, UISC, UGC):
	"""Predicate for USE category 'S': So/Sc symbols, except U+25CC."""
	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
	return (U != 0x25CC #SPEC-DRAFT
		and UGC in (So, Sc))
def is_SYM_MOD(U, UISC, UGC):
	"""Predicate for USE category 'SM': a fixed list of code points."""
	symbol_modifiers = (
		0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F,
		0x1B70, 0x1B71, 0x1B72, 0x1B73,
	)
	return U in symbol_modifiers
def is_VARIATION_SELECTOR(U, UISC, UGC):
	"""Predicate for USE category 'VS': U+FE00 through U+FE0F inclusive."""
	return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
	"""Predicate for USE category 'V'."""
	if UISC == Pure_Killer:
		return True
	# https://github.com/roozbehp/unicode-data/issues/6
	# U+AA29 is excluded here; is_VOWEL_MOD lists it instead.
	return (UGC != Lo and UISC in (Vowel, Vowel_Dependent) and U != 0xAA29)
def is_VOWEL_MOD(U, UISC, UGC):
	"""Predicate for USE category 'VM'."""
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	# https://github.com/roozbehp/unicode-data/issues/6
	# Bindus and U+AA29 qualify only when they are not letters (Lo).
	return UGC != Lo and (UISC == Bindu or U == 0xAA29)
222
# USE category tag -> predicate(U, UISC, UGC) deciding membership.
# map_to_use() asserts that exactly one predicate accepts each code point,
# so the predicates must be mutually exclusive over the input data.
# The tags are also emitted as "#define <tag> USE_<tag>" in the output.
use_mapping = {
	'B':	is_BASE,
	'IND':	is_BASE_IND,
	'N':	is_BASE_NUM,
	'GB':	is_BASE_OTHER,
	'CGJ':	is_CGJ,
	'F':	is_CONS_FINAL,
	'FM':	is_CONS_FINAL_MOD,
	'M':	is_CONS_MED,
	'CM':	is_CONS_MOD,
	'SUB':	is_CONS_SUB,
	'CS':	is_CONS_WITH_STACKER,
	'H':	is_HALANT,
	'HN':	is_HALANT_NUM,
	'ZWNJ':	is_ZWNJ,
	'ZWJ':	is_ZWJ,
	'WJ':	is_Word_Joiner,
	'O':	is_OTHER,
	'Rsv':	is_Reserved,
	'R':	is_REPHA,
	'S':	is_SYM,
	'SM':	is_SYM_MOD,
	'VS':	is_VARIATION_SELECTOR,
	'V':	is_VOWEL,
	'VM':	is_VOWEL_MOD,
}
249
# USE category tag -> how its Indic_Positional_Category is folded in.
#
# A dict value maps a suffix ('Abv', 'Blw', 'Pst', 'Pre') to the positional
# categories that select it; map_to_use() appends the single matching
# suffix to the tag (e.g. 'V' + 'Abv').  A None value means the category
# may carry a positional category but gets no suffix.  Categories absent
# from this table must have Not_Applicable or Visual_Order_Left, which
# map_to_use() asserts.
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left],
		'Pst': [Right],
		'Pre': [Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'B': None,
	'FM': None,
	'SUB': None,
}
287
def map_to_use(data):
	"""Classify every code point of *data* into a USE category.

	*data* maps a code point to (UISC, UIPC, UGC, UBlock).  The result
	maps the same code points to (USE tag, UBlock), where the tag carries
	a positional suffix for the categories that use_positions gives one.

	Raises AssertionError when a code point is not accepted by exactly one
	use_mapping predicate, or when its positional category cannot be
	resolved to exactly one suffix.
	"""
	result = {}
	classifiers = use_mapping.items()
	for U,(UISC,UIPC,UGC,UBlock) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 8.0, but
		# have UIPC
		if U == 0x17DD:
			UISC = Vowel_Dependent
		if 0x1CE2 <= U <= 0x1CE8:
			UISC = Cantillation_Mark

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED:
			UISC = Tone_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/525
		if U == 0x1A7F:
			UISC = Consonant_Final
			UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/609
		if U == 0x20F0:
			UISC = Cantillation_Mark
			UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/626
		if U == 0xA8B4:
			UISC = Consonant_Medial

		# Exactly one predicate must accept the character.
		matches = [tag for tag,accepts in classifiers if accepts(U,UISC,UGC)]
		assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
		USE = matches[0]

		# Resolve Indic_Positional_Category

		# TODO: Not in Unicode 8.0 yet, but in spec.
		if U == 0x1B6C:
			UIPC = Bottom

		# TODO: These should die, but have UIPC in Unicode 8.0
		if U in [0x953, 0x954]:
			UIPC = Not_Applicable

		# TODO: In USE's override list but not in Unicode 8.0
		if U == 0x103C:
			UIPC = Left

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
		if 0xA926 <= U <= 0xA92A:
			UIPC = Top
		if U == 0x111CA:
			UIPC = Bottom
		if U == 0x11300:
			UIPC = Top
		if U == 0x1133C:
			UIPC = Bottom
		if U == 0x1171E:
			UIPC = Left # Correct?!
		if 0x1CF2 <= U <= 0x1CF3:
			UIPC = Right
		if 0x1CF8 <= U <= 0x1CF9:
			UIPC = Top

		# Only categories listed in use_positions may carry a real position.
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		# Append the positional suffix when the category has a suffix table.
		placements = use_positions.get(USE)
		if placements:
			suffixes = [suf for suf,cats in placements.items() if cats and UIPC in cats]
			assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
			USE = USE + suffixes[0]

		result[U] = (USE, UBlock)
	return result
348
349defaults = ('O', 'No_Block')
350data = map_to_use(data)
351
352# Remove the outliers
353singles = {}
354for u in [0x034F, 0x25CC, 0x1107F]:
355	singles[u] = data[u]
356	del data[u]
357
358print "/* == Start of generated table == */"
359print "/*"
360print " * The following table is generated by running:"
361print " *"
362print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
363print " *"
364print " * on files with these headers:"
365print " *"
366for h in headers:
367	for l in h:
368		print " * %s" % (l.strip())
369print " */"
370print
371print '#include "hb-ot-shape-complex-use-private.hh"'
372print
373
# Running statistics maintained by print_block():
total = 0		# number of table slots emitted so far
used = 0		# number of those slots that had an entry in the data
last_block = None	# name of the block most recently emitted
def print_block (block, start, end, data):
	"""Emit the table entries for code points start..end (inclusive).

	block -- block name for the heading comment, or None for filler
	         between blocks (no heading, last_block left unchanged).
	start -- first code point; must be a multiple of 8.
	end   -- last code point; end+1 must be a multiple of 8.
	data  -- code point -> (USE tag, block); code points that are
	         missing are printed with the module-level defaults entry.

	Updates the global total/used/last_block counters.
	"""
	global total, used, last_block
	if block and block != last_block:
		print
		print
		print "  /* %s */" % block
		if start % 16:
			# The block starts mid-row: pad so that the first entries
			# line up under their columns.
			print ' ' * (20 + (start % 16 * 6)),
	num = 0
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 16 == 0:
			# New row of 16 entries, prefixed with its first code point.
			print
			print "  /* %04X */" % u,
		if u in data:
			num += 1
		d = data.get (u, defaults)
		sys.stdout.write ("%6s," % d[0])

	total += end - start + 1
	used += num
	if block:
		last_block = block
401
402uu = data.keys ()
403uu.sort ()
404
405last = -100000
406num = 0
407offset = 0
408starts = []
409ends = []
410for k,v in sorted(use_mapping.items()):
411	if k in use_positions and use_positions[k]: continue
412	print "#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:])
413for k,v in sorted(use_positions.items()):
414	if not v: continue
415	for suf in v.keys():
416		tag = k + suf
417		print "#define %s	USE_%s" % (tag, tag)
418print ""
419print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
420for u in uu:
421	if u <= last:
422		continue
423	block = data[u][1]
424
425	start = u//8*8
426	end = start+1
427	while end in uu and block == data[end][1]:
428		end += 1
429	end = (end-1)//8*8 + 7
430
431	if start != last + 1:
432		if start - last <= 1+16*3:
433			print_block (None, last+1, start-1, data)
434			last = start-1
435		else:
436			if last >= 0:
437				ends.append (last + 1)
438				offset += ends[-1] - starts[-1]
439			print
440			print
441			print "#define use_offset_0x%04xu %d" % (start, offset)
442			starts.append (start)
443
444	print_block (block, start, end, data)
445	last = end
446ends.append (last + 1)
447offset += ends[-1] - starts[-1]
448print
449print
450occupancy = used * 100. / total
451page_bits = 12
452print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
453print
454print "USE_TABLE_ELEMENT_TYPE"
455print "hb_use_get_categories (hb_codepoint_t u)"
456print "{"
457print "  switch (u >> %d)" % page_bits
458print "  {"
459pages = set([u>>page_bits for u in starts+ends+singles.keys()])
460for p in sorted(pages):
461	print "    case 0x%0Xu:" % p
462	for (start,end) in zip (starts, ends):
463		if p not in [start>>page_bits, end>>page_bits]: continue
464		offset = "use_offset_0x%04xu" % start
465		print "      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
466	for u,d in singles.items ():
467		if p != u>>page_bits: continue
468		print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
469	print "      break;"
470	print ""
471print "    default:"
472print "      break;"
473print "  }"
474print "  return USE_O;"
475print "}"
476print
477for k in sorted(use_mapping.keys()):
478	if k in use_positions and use_positions[k]: continue
479	print "#undef %s" % k
480for k,v in sorted(use_positions.items()):
481	if not v: continue
482	for suf in v.keys():
483		tag = k + suf
484		print "#undef %s" % tag
485print
486print "/* == End of generated table == */"
487
488# Maintain at least 50% occupancy in the table */
489if occupancy < 50:
490	raise Exception ("Table too sparse, please investigate: ", occupancy)
491