• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# flake8: noqa: F821
3
4"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
5
6Input files:
7* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
8* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
9* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
10* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
11* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
12* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
13* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
14* ms-use/IndicSyllabicCategory-Additional.txt
15* ms-use/IndicPositionalCategory-Additional.txt
16"""
17
18import sys
19
# Exactly nine input files must be named on the command line; otherwise
# print the usage text and stop.
if len (sys.argv) != 10:
	sys.exit (__doc__)

# Characters whose Script value is listed here are dropped from the merged data.
DISABLED_SCRIPTS = {'Arabic', 'Lao', 'Samaritan', 'Syriac', 'Thai'}

files = [open (path, encoding='utf-8') for path in sys.argv[1:]]

# The first two lines of every input except UnicodeData.txt (argument
# index 4) are kept as that file's header.
headers = []
for index, f in enumerate (files):
	if index != 4:
		headers.append ([f.readline (), f.readline ()])
# For the two *-Additional.txt inputs, keep collecting header lines up to
# the first blank line.  Index 4 was skipped above, hence the `- 1`.
for index in (7, 8):
	for line in files[index]:
		line = line.rstrip ()
		if not line:
			break
		headers[index - 1].append (line)
headers.append (["UnicodeData.txt does not have a header."])
41
# data[k] maps code point -> property value and values[k] counts code
# points per property value.  There is one slot per input file, but the
# two *-Additional.txt inputs (indices 7 and 8) are folded into slots 0
# and 1, so the last two slots stay empty.
data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
	for line in f:

		# Drop the trailing comment, if any.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			# No ';' on the line: blank or comment-only.
			continue

		# The first field is a single code point or a `start..end` range.
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		# ArabicShaping.txt (2) and UnicodeData.txt (4) carry the value of
		# interest in the third field; every other input in the second.
		t = fields[1 if i not in [2, 4] else 2]

		if i == 2:
			# Prefix joining types so they cannot clash with other names.
			t = 'jt_' + t
		elif i == 3 and t != 'Default_Ignorable_Code_Point':
			# Only this one derived core property is of interest.
			continue
		elif i == 7 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 8 and t == 'NA':
			t = 'Not_Applicable'

		i0 = i if i < 7 else i - 7
		for u in range (start, end + 1):
			data[i0][u] = t
		values[i0][t] = values[i0].get (t, 0) + end - start + 1

# Every input has now been read to the end; release the file handles
# instead of leaving them open for the rest of the run.
for f in files:
	f.close ()
78
# Fallback value for each of the seven merged properties, in slot order.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# TODO Characters that are not in Unicode Indic files, but used in USE
for u in (0x1B61, 0x1B63, 0x1B64, 0x1B65, 0x1B66, 0x1B67, 0x1B69, 0x1B6A,
	  0x2060):
	data[0][u] = defaults[0]

# Code points forced to Consonant_Placeholder:
for u in (
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1685
	0x1B5B, 0x1B5C, 0x1B5F, 0x1B62, 0x1B68,
	# TODO https://github.com/harfbuzz/harfbuzz/issues/1035
	0x11C44, 0x11C45,
	# TODO https://github.com/harfbuzz/harfbuzz/pull/1399
	0x111C8,
):
	data[0][u] = 'Consonant_Placeholder'
102
# Merge data into one dict:
# code point -> [UISC, UIPC, AJT, UDI, UGC, UBlock, script].
for index, default in enumerate (defaults):
	values[index][default] = values[index].get (default, 0) + 1

merged = {}
for index, table in enumerate (data):
	for u, v in table.items ():
		if u not in merged:
			# Only the first four inputs may introduce a code point; the
			# remaining ones merely annotate code points already present.
			if index >= 4:
				continue
			merged[u] = list (defaults)
		merged[u][index] = v

# Drop every character that belongs to a disabled script.
data = {u: props for u, props in merged.items () if props[6] not in DISABLED_SCRIPTS}
del merged
117
118
# Every property value the classification predicates may refer to by name.
property_names = (
	# General_Category
	[
		'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
		'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
		'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	]
	# Indic_Syllabic_Category
	+ [
		'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
		'Pure_Killer', 'Invisible_Stacker',
		'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
		'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
		'Consonant_With_Stacker', 'Consonant_Prefixed',
		'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
		'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
		'Consonant_Head_Letter', 'Consonant_Initial_Postfixed',
		'Modifying_Letter', 'Tone_Letter', 'Tone_Mark',
		'Gemination_Mark', 'Cantillation_Mark', 'Register_Shifter',
		'Syllable_Modifier', 'Consonant_Killer',
		'Non_Joiner', 'Joiner', 'Number_Joiner', 'Number',
		'Brahmi_Joining_Number',
		'Hieroglyph', 'Hieroglyph_Joiner',
		'Hieroglyph_Segment_Begin', 'Hieroglyph_Segment_End',
	]
	# Indic_Positional_Category
	+ [
		'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left',
		'Left_And_Right', 'Top', 'Bottom', 'Top_And_Bottom',
		'Top_And_Bottom_And_Left', 'Top_And_Right', 'Top_And_Left',
		'Top_And_Left_And_Right', 'Bottom_And_Left', 'Bottom_And_Right',
		'Top_And_Bottom_And_Right', 'Overstruck',
	]
	# Joining_Type
	+ ['jt_C', 'jt_D', 'jt_L', 'jt_R', 'jt_T', 'jt_U', 'jt_X']
)
191
class PropertyValue(object):
	"""A named property value that compares and hashes like its name.

	An instance is equal to the plain string holding its name and to any
	object whose `name` attribute equals its own, and it hashes like that
	string, so instances and strings are interchangeable as dict keys and
	in `in` tests.
	"""
	def __init__(self, name_):
		self.name = name_
	def __repr__(self):
		return "PropertyValue(%r)" % (self.name,)
	def __str__(self):
		return self.name
	def __eq__(self, other):
		if isinstance(other, str):
			return self.name == other
		try:
			return self.name == other.name
		except AttributeError:
			# Unrelated type (None, int, ...): let Python fall back to its
			# default comparison instead of raising.
			return NotImplemented
	def __ne__(self, other):
		equal = self.__eq__(other)
		return equal if equal is NotImplemented else not equal
	def __hash__(self):
		# Must agree with hash(name) so lookups by string find instances.
		return hash(self.name)
203
# Build one PropertyValue per name and publish each as a module-level
# global, so the predicates below can write e.g. `UISC == Virama`.
property_values = {}

for prop_name in property_names:
	prop = PropertyValue(prop_name)
	# A PropertyValue hashes and compares like its name, so these check
	# for a duplicated name and for a clash with an existing global.
	assert prop not in property_values
	assert prop not in globals()
	property_values[prop_name] = prop
globals().update(property_values)
212
213
def is_BASE(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category B."""
	if UISC in (Number, Consonant, Consonant_Head_Letter, Tone_Letter,
		    Vowel_Independent):
		return True
	# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
	if AJT in (jt_C, jt_D, jt_L, jt_R) and UISC != Joiner:
		return True
	# Letters (Lo) in otherwise dependent categories are treated as bases.
	return UGC == Lo and UISC in (Avagraha, Bindu, Consonant_Final,
				      Consonant_Medial, Consonant_Subjoined,
				      Vowel, Vowel_Dependent)
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category N: Brahmi joining numbers."""
	return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category GB: placeholders and a few fixed code points."""
	return (UISC == Consonant_Placeholder or
		U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category CGJ.

	The result is only meant to be used for its truthiness.
	"""
	# Also includes VARIATION_SELECTOR, WJ, and ZWJ
	if U == 0x200D:
		return True
	return UDI and UGC in (Mc, Me, Mn)
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category F."""
	if UISC == Consonant_Succeeding_Repha:
		return True
	# Final consonants that are letters (Lo) are excluded here.
	return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category FM: syllable modifiers."""
	return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category M."""
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	if UISC == Consonant_Initial_Postfixed:
		return True
	# Medial consonants that are letters (Lo) are excluded here.
	return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category CM; symbol modifiers (SM) are excluded."""
	if UISC not in (Nukta, Gemination_Mark, Consonant_Killer):
		return False
	return not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category SUB: subjoined consonants that are not letters (Lo)."""
	return UGC != Lo and UISC == Consonant_Subjoined
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category CS."""
	return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category H, minus the code points split off as HVM and Sk."""
	if UISC not in (Virama, Invisible_Stacker):
		return False
	return not (is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
		    or is_SAKOT(U, UISC, UDI, UGC, AJT))
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category HVM: the single code point U+1134D."""
	# Split off of HALANT
	# https://github.com/harfbuzz/harfbuzz/issues/1379
	return 0x1134D == U
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category HN: number joiners."""
	return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category G."""
	return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category J."""
	return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category SB."""
	return UISC == Hieroglyph_Segment_Begin
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category SE."""
	return UISC == Hieroglyph_Segment_End
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category ZWNJ: non-joiners."""
	return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category O.

	Candidates are unassigned (Cn) or Po characters and a few syllabic
	categories; anything already claimed as B, GB, CGJ or SM is excluded.
	"""
	# Also includes BASE_IND, Rsv, and SYM
	candidate = (UGC in (Cn, Po) or
		     UISC in (Consonant_Dead, Joiner, Modifying_Letter, Other))
	if not candidate:
		return False
	return not (is_BASE(U, UISC, UDI, UGC, AJT)
		    or is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
		    or is_CGJ(U, UISC, UDI, UGC, AJT)
		    or is_SYM_MOD(U, UISC, UDI, UGC, AJT))
def is_REPHA(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category R."""
	return UISC in (Consonant_Preceding_Repha, Consonant_Prefixed)
def is_SAKOT(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category Sk: the single code point U+1A60."""
	# Split off of HALANT
	return 0x1A60 == U
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category SM: code points U+1B6B..U+1B73."""
	return U in range(0x1B6B, 0x1B73 + 1)
def is_VOWEL(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category V."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC == Pure_Killer:
		return True
	# U+AA29 is classified as a vowel modifier (VM) instead.
	return (UGC != Lo and UISC in (Vowel, Vowel_Dependent)
		and U != 0xAA29)
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
	"""Predicate for USE category VM."""
	# https://github.com/harfbuzz/harfbuzz/issues/376
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	# Non-letter bindus, plus U+AA29 which is_VOWEL deliberately skips.
	return UGC != Lo and (UISC == Bindu or U == 0xAA29)
290
# USE category tag -> predicate deciding whether a character belongs to it.
# map_to_use() requires exactly one predicate to accept each character.
use_mapping = {
	'B': is_BASE,
	'N': is_BASE_NUM,
	'GB': is_BASE_OTHER,
	'CGJ': is_CGJ,
	'F': is_CONS_FINAL,
	'FM': is_CONS_FINAL_MOD,
	'M': is_CONS_MED,
	'CM': is_CONS_MOD,
	'SUB': is_CONS_SUB,
	'CS': is_CONS_WITH_STACKER,
	'H': is_HALANT,
	'HVM': is_HALANT_OR_VOWEL_MODIFIER,
	'HN': is_HALANT_NUM,
	'G': is_HIEROGLYPH,
	'J': is_HIEROGLYPH_JOINER,
	'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
	'SE': is_HIEROGLYPH_SEGMENT_END,
	'ZWNJ': is_ZWNJ,
	'O': is_OTHER,
	'R': is_REPHA,
	'Sk': is_SAKOT,
	'SM': is_SYM_MOD,
	'V': is_VOWEL,
	'VM': is_VOWEL_MOD,
}
317
# USE category tag -> {suffix: [Indic_Positional_Category values]}.
# A category mapped to None may carry a positional category but gets no
# suffix; a category with a dict becomes e.g. 'VAbv' or 'MPre'.
use_positions = {
	'F': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Right]},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Bottom_And_Left],
	},
	'CM': {'Abv': [Top], 'Blw': [Bottom, Overstruck]},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {'Abv': [Top], 'Blw': [Bottom]},
	'H': None,
	'HVM': None,
	'B': None,
	'FM': {'Abv': [Top], 'Blw': [Bottom], 'Pst': [Not_Applicable]},
	'R': None,
	'SUB': None,
}
361
def map_to_use(data):
	"""Translate merged Unicode properties into USE categories.

	`data` maps each code point to a 7-item sequence
	(UISC, UIPC, AJT, UDI, UGC, UBlock, <ignored>).  The result maps each
	code point to a `(USE, UBlock)` tuple.  USE is the one `use_mapping`
	key whose predicate accepts the character, followed by a positional
	suffix from `use_positions` when that category defines suffixes.

	Raises AssertionError when a character matches zero or several
	categories, or when its positional category cannot be resolved.
	"""
	out = {}
	classifiers = use_mapping.items()
	for U, props in data.items():
		UISC, UIPC, AJT, UDI, UGC, UBlock, _ = props

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= U <= 0x1CE8:
			UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
			UISC = Vowel_Dependent

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
		if 0x1BF2 <= U <= 0x1BF3:
			UISC = Nukta
			UIPC = Bottom

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED:
			UISC = Tone_Mark

		# TODO: https://github.com/microsoft/font-tools/issues/1
		if U == 0xA982:
			UISC = Consonant_Succeeding_Repha

		# Exactly one predicate must claim the character.
		matches = [tag for tag, accepts in classifiers if accepts(U, UISC, UDI, UGC, AJT)]
		assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, matches)
		USE = matches[0]

		# Resolve Indic_Positional_Category

		# TODO: These should die, but have UIPC in Unicode 13.0.0
		if U in [0x953, 0x954]:
			UIPC = Not_Applicable

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0
		if 0xA926 <= U <= 0xA92A:
			UIPC = Top
		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		#  and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in [0x11302, 0x11303, 0x114C1]:
			UIPC = Top
		if 0x1CF8 <= U <= 0x1CF9:
			UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
		# also  https://github.com/harfbuzz/harfbuzz/issues/1012
		if 0x1112A <= U <= 0x1112B:
			UIPC = Top
		if 0x11131 <= U <= 0x11132:
			UIPC = Top

		# A character with a real positional category must belong to a
		# USE category that use_positions knows about.
		position_is_neutral = UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F
		assert position_is_neutral or USE in use_positions, \
			"%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

		suffixes = use_positions.get(USE)
		if suffixes:
			hits = [suffix for suffix, positions in suffixes.items()
				if positions and UIPC in positions]
			assert len(hits) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, hits)
			USE += hits[0]

		out[U] = (USE, UBlock)
	return out
418
# From here on `data` maps code point -> (USE category, block) and
# `defaults` is the entry used for code points missing from it.
defaults = ('O', 'No_Block')
data = map_to_use(data)

# Banner comment: how the table was generated ...
for text in (
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]),
	" *",
	" * on files with these headers:",
	" *",
):
	print (text)
# ... and from which versions of the input files.
for header in headers:
	for header_line in header:
		print (" * %s" % (header_line.strip()))
# Close the banner, open the include guard and emit the includes.
for text in (
	" */",
	"",
	"#ifndef HB_OT_SHAPE_COMPLEX_USE_TABLE_HH",
	"#define HB_OT_SHAPE_COMPLEX_USE_TABLE_HH",
	"",
	'#include "hb.hh"',
	"",
	'#include "hb-ot-shape-complex-use-machine.hh"',
	"",
):
	print (text)
442
# Running statistics over everything print_block() has emitted.
total = 0	# table slots written
used = 0	# slots whose code point is present in the data
last_block = None	# most recent named block, to avoid repeating its title

def print_block (block, start, end, data):
	"""Print the table entries for code points start..end (inclusive).

	`start` and `end + 1` must be multiples of 8.  When `block` is set and
	differs from the previously printed block, a title comment is emitted
	first.  Code points missing from `data` are printed with the module
	level `defaults` entry.  Updates the `total`, `used` and `last_block`
	globals.
	"""
	global total, used, last_block
	if block and block != last_block:
		print ()
		print ()
		print ("  /* %s */" % block)
		if start % 16:
			# The block starts mid-row: pad up to its first column.
			print (' ' * (20 + (start % 16 * 6)), end='')
	assert start % 8 == 0
	assert (end+1) % 8 == 0

	present = 0
	for u in range (start, end + 1):
		if not u % 16:
			# Start a new row of 16 entries, labelled with its code point.
			print ("\n  /* %04X */" % u, end='')
		if u in data:
			present += 1
			category = data[u][0]
		else:
			category = defaults[0]
		print ("%6s," % category, end='')

	total += end - start + 1
	used += present
	if block:
		last_block = block
470
uu = sorted (data)

# State shared with the table-emission loop that follows.
last = -100000
num = 0
offset = 0
starts, ends = [], []

print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
# Categories without positional suffixes get a single macro each ...
for tag, predicate in sorted (use_mapping.items ()):
	if use_positions.get (tag):
		continue
	print ("#define %s	USE(%s)	/* %s */" % (tag, tag, predicate.__name__[3:]))
# ... the others get one macro per suffix.
for category, positions in sorted (use_positions.items ()):
	if not positions:
		continue
	for suffix in positions:
		tag = category + suffix
		print ("#define %s	USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
# Emit the table body.  Runs of characters are printed in 8-aligned
# chunks; small gaps between runs are filled in, while larger gaps start
# a new table segment with its own `use_offset_0x...` macro.
print ("static const uint8_t use_table[] = {")
for u in uu:
	if u <= last:
		# Already covered by the previous chunk.
		continue
	if data[u][0] == 'O':
		# 'O' is what the lookup returns by default; never start a chunk on it.
		continue
	block = data[u][1]

	# Extend the chunk over consecutive code points of the same block.
	# Membership is tested against the dict (O(1)) rather than the sorted
	# key list `uu` (O(n)); both hold exactly the same code points.
	start = u//8*8
	end = start+1
	while end in data and block == data[end][1]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap: pad it so the segment stays contiguous.
			print_block (None, last+1, start-1, data)
		else:
			# Large gap: close the current segment and open a new one.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ()
			print ()
			print ("#define use_offset_0x%04xu %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final segment.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
# Emit the lookup function: a switch on the code point's page, with one
# range test per table segment touching that page.
for text in (
	"",
	"static inline uint8_t",
	"hb_use_get_category (hb_codepoint_t u)",
	"{",
	"  switch (u >> %d)" % page_bits,
	"  {",
):
	print (text)
pages = {u >> page_bits for u in starts + ends}
for p in sorted (pages):
	print ("    case 0x%0Xu:" % p)
	for start, end in zip (starts, ends):
		# NOTE(review): only the first and last page of a segment are
		# matched; a segment spanning three or more pages would get no
		# case for the pages in between -- confirm this cannot happen.
		if p != start >> page_bits and p != end >> page_bits:
			continue
		offset_macro = "use_offset_0x%04xu" % start
		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset_macro))
	print ("      break;")
	print ("")
for text in (
	"    default:",
	"      break;",
	"  }",
	"  return USE(O);",
	"}",
	"",
):
	print (text)
# Undefine every macro that was defined ahead of the table.
for tag in sorted (use_mapping):
	if use_positions.get (tag):
		continue
	print ("#undef %s" % tag)
for category, positions in sorted (use_positions.items ()):
	if not positions:
		continue
	for suffix in positions:
		print ("#undef %s" % (category + suffix))
print ()
print ()
print ("#endif /* HB_OT_SHAPE_COMPLEX_USE_TABLE_HH */")
print ("/* == End of generated table == */")

# Maintain at least 50% occupancy in the table.  The check runs after the
# table has already been printed.
if occupancy < 50:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
562