#!/usr/bin/env python3
# flake8: noqa: F821

# Keep the usage string below as the module docstring (the first statement
# in the file) so that sys.exit (__doc__) can print it.
"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""

import logging
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

import sys

if len (sys.argv) != 10:
	sys.exit (__doc__)

DISABLED_SCRIPTS = {
	'Arabic',
	'Lao',
	'Samaritan',
	'Syriac',
	'Thai',
}

files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

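# Index of each input file in `files` (same order as sys.argv / the usage
# string above):
#   0 IndicSyllabicCategory.txt   1 IndicPositionalCategory.txt
#   2 ArabicShaping.txt           3 DerivedCoreProperties.txt
#   4 UnicodeData.txt             5 Blocks.txt
#   6 Scripts.txt                 7 ms-use/IndicSyllabicCategory-Additional.txt
#   8 ms-use/IndicPositionalCategory-Additional.txt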
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
for j in range(7, 9):
	for line in files[j]:
		line = line.rstrip()
		if not line:
			break
		headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])

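# Parse every input file into unicode_data[i]: codepoint -> property value.
# values[i] counts how many codepoints carry each value.  The two ms-use
# Additional files (indices 7 and 8) are folded back into slots 0 and 1,
# overriding the UCD Indic_Syllabic_Category / Indic_Positional_Category data.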
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
	for line in f:

		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		t = fields[1 if i not in [2, 4] else 2]

		if i == 2:
			t = 'jt_' + t
		elif i == 3 and t != 'Default_Ignorable_Code_Point':
			continue
		elif i == 7 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 8 and t == 'NA':
			t = 'Not_Applicable'

		i0 = i if i < 7 else i - 7
		for u in range (start, end + 1):
			unicode_data[i0][u] = t
		values[i0][t] = values[i0].get (t, 0) + end - start + 1

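# Per-codepoint record layout used in `combined` below, and the corresponding
# default values: (UISC, UIPC, AJT, UDI, UGC, Block, Script).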
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (unicode_data):
	for u,v in d.items ():
		if u not in combined:
			if i >= 4:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}


property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	'Symbol_Modifier',
	'Hieroglyph',
	'Hieroglyph_Joiner',
	'Hieroglyph_Mark_Begin',
	'Hieroglyph_Mark_End',
	'Hieroglyph_Mirror',
	'Hieroglyph_Modifier',
	'Hieroglyph_Segment_Begin',
	'Hieroglyph_Segment_End',
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Bottom_And_Left',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
	# Joining_Type
	'jt_C',
	'jt_D',
	'jt_L',
	'jt_R',
	'jt_T',
	'jt_U',
	'jt_X',
]

class PropertyValue(object):
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __eq__(self, other):
		return self.name == (other if isinstance(other, str) else other.name)
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		return hash(str(self))

property_values = {}

for name in property_names:
	value = PropertyValue(name)
	assert value not in property_values
	assert value not in globals()
	property_values[name] = value
globals().update(property_values)


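# The predicates below classify one codepoint into a USE category.  Arguments:
# U (the codepoint), UISC (Indic_Syllabic_Category), UDI
# (Default_Ignorable_Code_Point or ''), UGC (General_Category), AJT (Arabic
# Joining_Type).  The bare property-value names they test against (Consonant,
# Virama, jt_C, ...) are the PropertyValue globals installed above, which is
# why flake8's F821 (undefined name) check is silenced at the top of the file.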
def is_BASE(U, UISC, UDI, UGC, AJT):
	return (UISC in [Number, Consonant, Consonant_Head_Letter,
			Tone_Letter,
			Vowel_Independent,
			] or
		# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
		AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
					Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
	if UISC == Consonant_Placeholder: return True
	return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
	# Also includes VARIATION_SELECTOR and ZWJ
	return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
	return ((UISC == Consonant_Final and UGC != Lo) or
		UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	return (UISC == Consonant_Medial and UGC != Lo or
		UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
	return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return (UISC == Invisible_Stacker
		and not is_SAKOT(U, UISC, UDI, UGC, AJT)
	)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
	return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
	# Also includes BASE_IND and SYM
	return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
		and not is_BASE(U, UISC, UDI, UGC, AJT)
		and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
		and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
		and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
	)
def is_REPHA(U, UISC, UDI, UGC, AJT):
	return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
	return (UISC == Pure_Killer or
		UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
		UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
	# Also includes Rsv
	return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
		and UISC == Other
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
	) or UGC == Cn

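# USE category code -> predicate.  map_to_use() below asserts that exactly one
# of these predicates matches each codepoint.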
use_mapping = {
	'B':	is_BASE,
	'N':	is_BASE_NUM,
	'GB':	is_BASE_OTHER,
	'CGJ':	is_CGJ,
	'F':	is_CONS_FINAL,
	'FM':	is_CONS_FINAL_MOD,
	'M':	is_CONS_MED,
	'CM':	is_CONS_MOD,
	'SUB':	is_CONS_SUB,
	'CS':	is_CONS_WITH_STACKER,
	'H':	is_HALANT,
	'HVM':	is_HALANT_OR_VOWEL_MODIFIER,
	'HN':	is_HALANT_NUM,
	'IS':	is_INVISIBLE_STACKER,
	'G':	is_HIEROGLYPH,
	'HM':	is_HIEROGLYPH_MOD,
	'HR':	is_HIEROGLYPH_MIRROR,
	'J':	is_HIEROGLYPH_JOINER,
	'SB':	is_HIEROGLYPH_SEGMENT_BEGIN,
	'SE':	is_HIEROGLYPH_SEGMENT_END,
	'ZWNJ':	is_ZWNJ,
	'O':	is_OTHER,
	'R':	is_REPHA,
	'Sk':	is_SAKOT,
	'SM':	is_SYM_MOD,
	'V':	is_VOWEL,
	'VM':	is_VOWEL_MOD,
	'WJ':	is_Word_Joiner,
}

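# Positional sub-categories: for USE categories that are further split by
# position, map each suffix (Abv/Blw/Pst/Pre) to the Indic_Positional_Category
# values it covers.  None means the category takes no positional suffix.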
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Bottom_And_Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'HM': None,
	'HR': None,
	'HVM': None,
	'IS': None,
	'B': None,
	'FM': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Not_Applicable],
	},
	'R': None,
	'SUB': None,
}

def map_to_use(data):
	out = {}
	items = use_mapping.items()
	for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED: UISC = Tone_Mark

		values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
		assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, values)
		USE = values[0]

		# Resolve Indic_Positional_Category

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		#  and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top

		assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or
			USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

		pos_mapping = use_positions.get(USE, None)
		if pos_mapping:
			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
			assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
			USE = USE + values[0]

		out[U] = (USE, UBlock)
	return out

use_data = map_to_use(combined)

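# The rest of the script emits the C header: a comment reproducing the input
# file headers, include guards, the packTab-generated category lookup
# function, and the #define/#undef wrappers around the USE() macro.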
print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" *   {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))
print (" */")
print ()
print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
print ("#define HB_OT_SHAPER_USE_TABLE_HH")
print ()
print ('#include "hb.hh"')
print ()
print ('#include "hb-ot-shaper-use-machine.hh"')
print ()

total = 0
used = 0
last_block = None
def print_block (block, start, end, use_data):
	global total, used, last_block
	if block and block != last_block:
		print ()
		print ()
		print ("  /* %s */" % block)
		if start % 16:
			print (' ' * (20 + (start % 16 * 6)), end='')
	num = 0
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 16 == 0:
			print ()
			print ("  /* %04X */" % u, end='')
		if u in use_data:
			num += 1
		d = use_data.get (u)
		if d is not None:
			d = d[0]
		elif u in unicode_data[4]:
			d = 'O'
		else:
			d = 'WJ'
		print ("%6s," % d, end='')

	total += end - start + 1
	used += num
	if block:
		last_block = block

uu = sorted (use_data.keys ())

last = -100000
num = 0
offset = 0
starts = []
ends = []
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k,v in sorted(use_mapping.items()):
	if k in use_positions and use_positions[k]: continue
	print ("#define %s	USE(%s)	/* %s */" % (k, k, v.__name__[3:]))
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#define %s	USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")


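# Pack the category table twice with packTab: compression=5 for the regular
# build and compression=9 for size-optimized builds.  The first table is
# emitted under #ifndef HB_OPTIMIZE_SIZE, the second under #else, and the
# matching #endif is printed after the loop.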
import packTab
data = {u:v[0] for u,v in use_data.items()}

DEFAULT = 5
COMPACT = 9
for compression in (DEFAULT, COMPACT):

    logging.info('  Compression=%d:' % compression)
    print()
    if compression == DEFAULT:
        print('#ifndef HB_OPTIMIZE_SIZE')
    elif compression == COMPACT:
        print('#else')
    else:
        assert False
    print()

    code = packTab.Code('hb_use')
    sol = packTab.pack_table(data, compression=compression, default='O')
    logging.info('      FullCost=%d' % (sol.fullCost))
    sol.genCode(code, 'get_category')
    code.print_c(linkage='static inline')
    print ()

print('#endif')

print ()
for k in sorted(use_mapping.keys()):
	if k in use_positions and use_positions[k]: continue
	print ("#undef %s" % k)
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#undef %s" % tag)
print ()
print ()
print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print ("/* == End of generated table == */")