• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# flake8: noqa
3
4from __future__ import print_function, division, absolute_import
5
6import io
7import sys
8
# Exactly four input files are required; otherwise print a usage hint and stop.
if len (sys.argv) != 5:
	print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
	sys.exit (1)

# Blocks whose code points are filtered out of the merged table later on.
BLACKLISTED_BLOCKS = ["Thai", "Lao"]

# The input files are opened one by one inside the try block so that every
# handle that was successfully opened is closed again, even if a later open
# or the parsing below raises.
files = []
try:
	for x in sys.argv[1:]:
		files.append (io.open (x, encoding='utf-8'))

	# The first two lines of every file except the third one (UnicodeData.txt)
	# are kept as headers; they are echoed into the generated output.
	headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
	headers.append (["UnicodeData.txt does not have a header."])

	# data[i] maps code point -> property value read from file i.
	# values[i] maps property value -> number of code points carrying it.
	data = [{} for f in files]
	values = [{} for f in files]
	for i, f in enumerate (files):
		for line in f:

			# Strip trailing '#' comments.
			j = line.find ('#')
			if j >= 0:
				line = line[:j]

			# Lines without any ';' separator carry no record.
			fields = [x.strip () for x in line.split (';')]
			if len (fields) == 1:
				continue

			# The first field is either a single code point or a "start..end" range.
			uu = fields[0].split ('..')
			start = int (uu[0], 16)
			if len (uu) == 1:
				end = start
			else:
				end = int (uu[1], 16)

			# The value sits in field 1, except for the third file where it is field 2.
			t = fields[1 if i != 2 else 2]

			for u in range (start, end + 1):
				data[i][u] = t
			values[i][t] = values[i].get (t, 0) + end - start + 1
finally:
	for f in files:
		f.close ()
45
# Fallback value for each of the four inputs, in command-line order.
defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# TODO Characters that are not in the Unicode Indic files, but are used in USE
for u in (
	0x034F,
	0x1B61, 0x1B63, 0x1B64, 0x1B65, 0x1B66, 0x1B67, 0x1B69, 0x1B6A,
	0x2060,
):
	data[0][u] = defaults[0]

# Code points forced to Consonant_Placeholder:
for u in (
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
	0x1B5B, 0x1B5C, 0x1B5F, 0x1B62, 0x1B68,
	# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
	0x11C44, 0x11C45,
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
	0x111C8,
):
	data[0][u] = 'Consonant_Placeholder'

# U+FE00..U+FE0F get the default syllabic category as well.
for u in range (0xFE00, 0xFE0F + 1):
	data[0][u] = defaults[0]
72
# Merge the per-file dicts into a single dict keyed by code point.  Each
# entry is a four-element list, one slot per input file, pre-filled with
# the defaults.

# Count each default value once more in the per-file statistics.
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1

combined = {}
for i,d in enumerate (data):
	for u,v in d.items ():
		if u not in combined:
			# Only the first two files may introduce new code points;
			# the later files merely annotate the ones already present.
			if i >= 2:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v

# Drop every code point whose block (slot 3) is blacklisted.
data = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
del combined
num = len (data)
88
89
# Every property value name that becomes a module-level constant, grouped
# by the Unicode property it belongs to.
property_names = (
	# General_Category
	[
		'Cc', 'Cf', 'Cn', 'Co', 'Cs',
		'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
		'Mc', 'Me', 'Mn',
		'Nd', 'Nl', 'No',
		'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
		'Sc', 'Sk', 'Sm', 'So',
		'Zl', 'Zp', 'Zs',
	]
	# Indic_Syllabic_Category
	+ [
		'Other',
		'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
		'Pure_Killer', 'Invisible_Stacker',
		'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
		'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
		'Consonant_With_Stacker', 'Consonant_Prefixed',
		'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
		'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
		'Consonant_Head_Letter', 'Consonant_Initial_Postfixed',
		'Modifying_Letter', 'Tone_Letter', 'Tone_Mark',
		'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
		'Syllable_Modifier', 'Consonant_Killer',
		'Non_Joiner', 'Joiner', 'Number_Joiner',
		'Number', 'Brahmi_Joining_Number',
	]
	# Indic_Positional_Category
	+ [
		'Not_Applicable',
		'Right', 'Left', 'Visual_Order_Left', 'Left_And_Right',
		'Top', 'Bottom', 'Top_And_Bottom',
		'Top_And_Right', 'Top_And_Left', 'Top_And_Left_And_Right',
		'Bottom_And_Left', 'Bottom_And_Right',
		'Top_And_Bottom_And_Right', 'Overstruck',
	]
)
149
# Python 3 has no basestring; fall back to str there.
try:
	basestring
except NameError:
	basestring = str

class PropertyValue(object):
	"""A named property value that compares equal to its own name.

	Instances compare equal to other PropertyValue objects with the same
	name and to plain strings equal to that name, and hash like the name,
	so they can be mixed with the strings read from the data files.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __repr__(self):
		return "%s(%r)" % (type(self).__name__, self.name)
	def __eq__(self, other):
		if isinstance(other, basestring):
			return self.name == other
		if isinstance(other, PropertyValue):
			return self.name == other.name
		# Unrelated types are simply "not equal" instead of raising
		# AttributeError on the missing .name attribute.
		return NotImplemented
	def __ne__(self, other):
		result = self.__eq__(other)
		if result is NotImplemented:
			return result
		return not result
	def __hash__(self):
		# Must agree with the hash of the equivalent string.
		return hash(str(self))
166
# Name -> PropertyValue for every entry of property_names.
property_values = {}

for name in property_names:
	value = PropertyValue(name)
	# A name must neither be registered twice nor collide with an
	# existing module-level name, since it is about to become one.
	assert not (value in property_values)
	assert not (value in globals())
	property_values[name] = value

# Expose each property value as a module-level constant (e.g. Lo, Top).
globals().update(property_values)
175
176
def is_BASE(U, UISC, UGC):
	"""Predicate behind the 'B' entry of use_mapping.

	U is the code point; UISC and UGC are the syllabic category and the
	general category that map_to_use passes in.  Returns a bool.
	"""
	# Categories that qualify regardless of the general category.
	unconditional = (
		Number, Consonant, Consonant_Head_Letter,
		#SPEC-DRAFT Consonant_Placeholder,
		Tone_Letter,
		Vowel_Independent, #SPEC-DRAFT
	)
	if UISC in unconditional:
		return True
	# Categories that qualify only when the general category is Lo.
	lo_only = (
		Avagraha, Bindu, Consonant_Final, Consonant_Medial,
		Consonant_Subjoined, Vowel, Vowel_Dependent,
	)
	return UGC == Lo and UISC in lo_only
def is_BASE_IND(U, UISC, UGC):
	"""Predicate behind the 'IND' entry of use_mapping."""
	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
	if UISC in (Consonant_Dead, Modifying_Letter):
		return True
	# Po characters qualify too, apart from these explicit exceptions.
	po_exceptions = (
		0x104B, 0x104E, 0x1B5B, 0x1B5C, 0x1B5F, 0x2022,
		0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45,
	)
	# SPEC-DRAFT-OUTDATED! U == 0x002D is deliberately not accepted here.
	return UGC == Po and U not in po_exceptions
def is_BASE_NUM(U, UISC, UGC):
	"""Predicate behind the 'N' entry of use_mapping."""
	return Brahmi_Joining_Number == UISC
def is_BASE_OTHER(U, UISC, UGC):
	"""Predicate behind the 'GB' entry of use_mapping."""
	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
	explicit = (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE)
	# Accepting Consonant_Placeholder here is marked #SPEC-DRAFT.
	return UISC == Consonant_Placeholder or U in explicit
def is_CGJ(U, UISC, UGC):
	"""Predicate behind the 'CGJ' entry of use_mapping: only U+034F matches."""
	return 0x034F == U
def is_CONS_FINAL(U, UISC, UGC):
	"""Predicate behind the 'F' entry of use_mapping."""
	if UISC == Consonant_Succeeding_Repha:
		return True
	# Consonant_Final counts only when the general category is not Lo.
	return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UGC):
	"""Predicate behind the 'FM' entry of use_mapping."""
	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
	return Syllable_Modifier == UISC
def is_CONS_MED(U, UISC, UGC):
	"""Predicate behind the 'M' entry of use_mapping."""
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	if UISC == Consonant_Initial_Postfixed:
		return True
	# Consonant_Medial counts only when the general category is not Lo.
	return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UGC):
	"""Predicate behind the 'CM' entry of use_mapping."""
	modifier_categories = (Nukta, Gemination_Mark, Consonant_Killer)
	return UISC in modifier_categories
def is_CONS_SUB(U, UISC, UGC):
	"""Predicate behind the 'SUB' entry of use_mapping."""
	#SPEC-DRAFT return UISC == Consonant_Subjoined
	if not UISC == Consonant_Subjoined:
		return False
	# Subjoined consonants with general category Lo are excluded.
	return UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UGC):
	"""Predicate behind the 'CS' entry of use_mapping."""
	return Consonant_With_Stacker == UISC
def is_HALANT(U, UISC, UGC):
	"""Predicate behind the 'H' entry of use_mapping.

	Matches Virama / Invisible_Stacker characters that are not claimed by
	is_HALANT_OR_VOWEL_MODIFIER or is_SAKOT.
	"""
	if UISC not in (Virama, Invisible_Stacker):
		return False
	if is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
		return False
	return not is_SAKOT(U, UISC, UGC)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
	"""Predicate behind the 'HVM' entry of use_mapping: two fixed code points."""
	# https://github.com/harfbuzz/harfbuzz/issues/1102
	# https://github.com/harfbuzz/harfbuzz/issues/1379
	return U == 0x11046 or U == 0x1134D
def is_HALANT_NUM(U, UISC, UGC):
	"""Predicate behind the 'HN' entry of use_mapping."""
	return Number_Joiner == UISC
def is_ZWNJ(U, UISC, UGC):
	"""Predicate behind the 'ZWNJ' entry of use_mapping."""
	return Non_Joiner == UISC
def is_ZWJ(U, UISC, UGC):
	"""Predicate behind the 'ZWJ' entry of use_mapping."""
	return Joiner == UISC
def is_Word_Joiner(U, UISC, UGC):
	"""Predicate behind the 'WJ' entry of use_mapping: only U+2060 matches."""
	return 0x2060 == U
def is_OTHER(U, UISC, UGC):
	"""Predicate behind the 'O' entry of use_mapping.

	Matches characters whose syllabic category is Other and that none of
	the more specific predicates below claims.
	"""
	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
	if not UISC == Other:
		return False
	more_specific = (
		is_SYM,
		is_SYM_MOD,
		is_CGJ,
		is_Word_Joiner,
		is_VARIATION_SELECTOR,
	)
	return not any(check(U, UISC, UGC) for check in more_specific)
def is_Reserved(U, UISC, UGC):
	"""Predicate behind the 'Rsv' entry of use_mapping."""
	# Compared against the plain string 'Cn' rather than a module constant.
	reserved = UGC == 'Cn'
	return reserved
def is_REPHA(U, UISC, UGC):
	"""Predicate behind the 'R' entry of use_mapping."""
	return UISC == Consonant_Preceding_Repha or UISC == Consonant_Prefixed
def is_SAKOT(U, UISC, UGC):
	"""Predicate behind the 'Sk' entry of use_mapping: only U+1A60 matches."""
	return 0x1A60 == U
def is_SYM(U, UISC, UGC):
	"""Predicate behind the 'S' entry of use_mapping."""
	# U+25CC is excluded as a #SPEC-DRAFT deviation; U+1B62 and U+1B68 are
	# excluded although their general category would qualify.
	if U in (0x25CC, 0x1B62, 0x1B68):
		return False
	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
	return UGC in (So, Sc)
def is_SYM_MOD(U, UISC, UGC):
	"""Predicate behind the 'SM' entry of use_mapping."""
	# The nine accepted code points, U+1B6B through U+1B73.
	accepted = (
		0x1B6B, 0x1B6C, 0x1B6D,
		0x1B6E, 0x1B6F, 0x1B70,
		0x1B71, 0x1B72, 0x1B73,
	)
	return U in accepted
def is_VARIATION_SELECTOR(U, UISC, UGC):
	"""Predicate behind the 'VS' entry of use_mapping: U+FE00..U+FE0F."""
	return U >= 0xFE00 and U <= 0xFE0F
def is_VOWEL(U, UISC, UGC):
	"""Predicate behind the 'V' entry of use_mapping."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC == Pure_Killer:
		return True
	# Lo characters and U+AA29 are never treated as vowels here.
	if UGC == Lo or U == 0xAA29:
		return False
	return UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UGC):
	"""Predicate behind the 'VM' entry of use_mapping."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	# Bindu and U+AA29 qualify only when the general category is not Lo.
	if UGC == Lo:
		return False
	return UISC == Bindu or U == 0xAA29
263
# USE category tag -> predicate deciding whether a character belongs to it.
# map_to_use asserts that exactly one predicate accepts each character.
use_mapping = dict(
	B=is_BASE,
	IND=is_BASE_IND,
	N=is_BASE_NUM,
	GB=is_BASE_OTHER,
	CGJ=is_CGJ,
	F=is_CONS_FINAL,
	FM=is_CONS_FINAL_MOD,
	M=is_CONS_MED,
	CM=is_CONS_MOD,
	SUB=is_CONS_SUB,
	CS=is_CONS_WITH_STACKER,
	H=is_HALANT,
	HVM=is_HALANT_OR_VOWEL_MODIFIER,
	HN=is_HALANT_NUM,
	ZWNJ=is_ZWNJ,
	ZWJ=is_ZWJ,
	WJ=is_Word_Joiner,
	O=is_OTHER,
	Rsv=is_Reserved,
	R=is_REPHA,
	S=is_SYM,
	Sk=is_SAKOT,
	SM=is_SYM_MOD,
	VS=is_VARIATION_SELECTOR,
	V=is_VOWEL,
	VM=is_VOWEL_MOD,
)
292
# USE category tag -> {position suffix -> positional categories selecting it}.
# A None entry means the category may carry a positional category but gets
# no suffix appended.
use_positions = dict(
	F=dict(
		Abv=[Top],
		Blw=[Bottom],
		Pst=[Right],
	),
	M=dict(
		Abv=[Top],
		Blw=[Bottom, Bottom_And_Left],
		Pst=[Right],
		Pre=[Left],
	),
	CM=dict(
		Abv=[Top],
		Blw=[Bottom],
	),
	V=dict(
		Abv=[Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		Blw=[Bottom, Overstruck, Bottom_And_Right],
		Pst=[Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
		Pre=[Left],
	),
	VM=dict(
		Abv=[Top],
		Blw=[Bottom, Overstruck],
		Pst=[Right],
		Pre=[Left],
	),
	SM=dict(
		Abv=[Top],
		Blw=[Bottom],
	),
	H=None,
	HVM=None,
	B=None,
	FM=dict(
		Abv=[Top],
		Blw=[Bottom],
		Pst=[Not_Applicable],
	),
	SUB=None,
)
335
def map_to_use(data):
	"""Classify every code point in *data* into a USE category.

	data maps a code point to a 4-item sequence
	(UISC, UIPC, UGC, UBlock).  For each entry the hard-coded overrides
	below are applied, the single matching predicate of use_mapping picks
	the base category, and, where use_positions has a non-empty table for
	that category, a position suffix is appended.

	Returns a dict mapping code point -> (USE category string, UBlock).
	Raises AssertionError when a character matches zero or several
	categories or positions, or carries a positional category for a
	category that use_positions does not list.
	"""
	out = {}
	items = use_mapping.items()
	for U,(UISC,UIPC,UGC,UBlock) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 12.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
		if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
		# Overrides to allow NFC order matching syllable
		# https://github.com/harfbuzz/harfbuzz/issues/1012
		if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC):
			if UIPC == Top:
				UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
		# also  https://github.com/harfbuzz/harfbuzz/issues/1012
		# Chakma vowels have Top and Bottom swapped.
		if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
			if UIPC == Top:
				UIPC = Bottom
			elif UIPC == Bottom:
				UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
		# Both assignments on the next line belong to the if.
		if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED: UISC = Tone_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
		if U == 0x11134: UISC = Gemination_Mark

		# Exactly one predicate of use_mapping must accept the character.
		values = [k for k,v in items if v(U,UISC,UGC)]
		assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
		USE = values[0]

		# Resolve Indic_Positional_Category

		# TODO: These should die, but have UIPC in Unicode 12.0
		if U in [0x953, 0x954]: UIPC = Not_Applicable

		# TODO: In USE's override list but not in Unicode 12.0
		if U == 0x103C: UIPC = Left

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 12.0
		if 0xA926 <= U <= 0xA92A: UIPC = Top
		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		#  and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top
		if U == 0x1171E: UIPC = Left
		if 0x1CF8 <= U <= 0x1CF9: UIPC = Top

		# A real positional category is only acceptable for categories
		# that use_positions knows about.
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		# Append the position suffix when the category has a position table;
		# None entries and unlisted categories keep the bare tag.
		pos_mapping = use_positions.get(USE, None)
		if pos_mapping:
			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
			assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
			USE = USE + values[0]

		out[U] = (USE, UBlock)
	return out
405
# From here on table entries are (USE category, block) pairs, so the
# fallback for unlisted code points changes shape accordingly.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Banner comment describing how the table was produced.
print ("\n".join ([
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
]))
# Echo the header lines captured from the input files.
for h in headers:
	for l in h:
		print (" * " + l.strip())
# Close the banner, then emit the include line surrounded by blank lines.
print (" */\n\n#include \"hb-ot-shape-complex-use.hh\"\n")
424
# Running statistics, updated by print_block:
#   total      - number of table slots emitted so far
#   used       - how many of those slots hold a real entry
#   last_block - name of the most recent named block printed
total = used = 0
last_block = None

def print_block (block, start, end, data):
	"""Print the table cells for code points start..end (inclusive).

	block is the block name to announce, or a false value for filler
	ranges.  start must be a multiple of 8 and end + 1 as well.  data maps
	code point -> entry whose first item is the text printed in the cell;
	code points missing from data print the first item of the module-level
	defaults.  Updates the module-level total, used and last_block.
	"""
	global total, used, last_block

	# Announce a block the first time it shows up.
	if block and block != last_block:
		print ("\n\n  /* %s */" % block)
		column = start % 16
		if column:
			# Pad so the first cell lands in its column of the 16-wide row.
			print (' ' * (20 + column * 6), end='')

	assert start % 8 == 0
	assert (end+1) % 8 == 0

	hits = 0
	for u in range (start, end+1):
		if not u % 16:
			# Start a new row labelled with its first code point.
			print ("\n  /* %04X */" % u, end='')
		if u in data:
			hits += 1
		print ("%6s," % data.get (u, defaults)[0], end='')

	total += end + 1 - start
	used += hits
	if block:
		last_block = block
452
# All classified code points, in ascending order.
uu = sorted (data.keys ())

last = -100000
num = 0
offset = 0
# starts[i] / ends[i] delimit (end exclusive) the i-th contiguous run of
# code points stored in the table.
starts = []
ends = []
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
# Short aliases for categories that carry no position suffix...
for k,v in sorted(use_mapping.items()):
	if k in use_positions and use_positions[k]: continue
	print ("#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:]))
# ...and for every category + position suffix combination.
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#define %s	USE_%s" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
for u in uu:
	if u <= last:
		continue
	block = data[u][1]

	# Grow the range over consecutive code points of the same block, then
	# round it out to multiples of 8 on both sides.
	start = u//8*8
	end = start+1
	# uu holds exactly the keys of data, so test membership on the dict
	# instead of scanning the sorted list on every step.
	while end in data and block == data[end][1]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap: fill it with default cells and keep the run going.
			print_block (None, last+1, start-1, data)
			last = start-1
		else:
			# Large gap: close the current run and open a new one with
			# its own offset macro.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ()
			print ()
			print ("#define use_offset_0x%04xu %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ()
print ("USE_TABLE_ELEMENT_TYPE")
print ("hb_use_get_category (hb_codepoint_t u)")
print ("{")
print ("  switch (u >> %d)" % page_bits)
print ("  {")
# One switch case per page touched by the start or end of any run.
pages = set([u>>page_bits for u in starts+ends])
for p in sorted(pages):
	print ("    case 0x%0Xu:" % p)
	for (start,end) in zip (starts, ends):
		if p not in [start>>page_bits, end>>page_bits]: continue
		# NOTE: from here on offset holds the macro name, not the running count.
		offset = "use_offset_0x%04xu" % start
		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
	print ("      break;")
	print ("")
print ("    default:")
print ("      break;")
print ("  }")
print ("  return USE_O;")
print ("}")
print ()
# Undefine every alias defined above.
for k in sorted(use_mapping.keys()):
	if k in use_positions and use_positions[k]: continue
	print ("#undef %s" % k)
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#undef %s" % tag)
print ()
print ("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table
if occupancy < 50:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
541