• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2
3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
4versa.
5
6It creates a ``const LangTag[]``, matching the tags from the OpenType
7languages system tag list to the language subtags of the BCP 47 language
8subtag registry, with some manual adjustments. The mappings are
9supplemented with macrolanguages' sublanguages and retired codes'
10replacements, according to BCP 47 and some manual additions where BCP 47
11omits a retired code entirely.
12
13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16multiple BCP 47 tags) are listed here, except when the alphabetically
17first BCP 47 tag happens to be the chosen disambiguated tag. In that
18case, the fallback behavior will choose the right tag anyway.
19
20usage: ./gen-tag-table.py languagetags language-subtag-registry
21
22Input files:
23* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
24* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
25"""
26
27import collections
28import html
29from html.parser import HTMLParser
30import itertools
31import re
32import sys
33import unicodedata
34
# Exactly two command-line arguments are required (see the "usage" line of
# the module docstring). Otherwise exit, using the module docstring as the
# message passed to ``sys.exit``.
if len (sys.argv) != 3:
	sys.exit (__doc__)
37
def expect (condition, message=None):
	"""Raise ``AssertionError`` unless ``condition`` is truthy.

	Args:
		condition: The value to check.
		message: Optional text for the exception. If it is ``None``, a
			bare ``AssertionError`` without arguments is raised.

	Raises:
		AssertionError: If ``condition`` is falsy.
	"""
	if condition:
		return
	if message is not None:
		raise AssertionError (message)
	raise AssertionError
43
def write (s):
	"""Write ``s`` to standard output as UTF-8 bytes.

	The text layer of ``sys.stdout`` is flushed first so that anything
	already printed ends up before the bytes written here, which go
	straight to the underlying binary buffer.

	Args:
		s (str): The text to write.
	"""
	stream = sys.stdout
	stream.flush ()
	encoded = s.encode ('utf-8')
	stream.buffer.write (encoded)
47
# The default language system is represented by the empty string; ``hb_tag``
# renders it as ``HB_TAG_NONE``.
DEFAULT_LANGUAGE_SYSTEM = ''

# Map of three-letter ISO 639-3 codes to their two-letter ISO 639-1
# equivalents. ``OpenTypeRegistryParser.handle_endtag`` looks every registry
# code up here and keeps codes without an entry unchanged.
# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
	'aar': 'aa',
	'abk': 'ab',
	'afr': 'af',
	'aka': 'ak',
	'amh': 'am',
	'ara': 'ar',
	'arg': 'an',
	'asm': 'as',
	'ava': 'av',
	'ave': 'ae',
	'aym': 'ay',
	'aze': 'az',
	'bak': 'ba',
	'bam': 'bm',
	'bel': 'be',
	'ben': 'bn',
	'bis': 'bi',
	'bod': 'bo',
	'bos': 'bs',
	'bre': 'br',
	'bul': 'bg',
	'cat': 'ca',
	'ces': 'cs',
	'cha': 'ch',
	'che': 'ce',
	'chu': 'cu',
	'chv': 'cv',
	'cor': 'kw',
	'cos': 'co',
	'cre': 'cr',
	'cym': 'cy',
	'dan': 'da',
	'deu': 'de',
	'div': 'dv',
	'dzo': 'dz',
	'ell': 'el',
	'eng': 'en',
	'epo': 'eo',
	'est': 'et',
	'eus': 'eu',
	'ewe': 'ee',
	'fao': 'fo',
	'fas': 'fa',
	'fij': 'fj',
	'fin': 'fi',
	'fra': 'fr',
	'fry': 'fy',
	'ful': 'ff',
	'gla': 'gd',
	'gle': 'ga',
	'glg': 'gl',
	'glv': 'gv',
	'grn': 'gn',
	'guj': 'gu',
	'hat': 'ht',
	'hau': 'ha',
	'hbs': 'sh',
	'heb': 'he',
	'her': 'hz',
	'hin': 'hi',
	'hmo': 'ho',
	'hrv': 'hr',
	'hun': 'hu',
	'hye': 'hy',
	'ibo': 'ig',
	'ido': 'io',
	'iii': 'ii',
	'iku': 'iu',
	'ile': 'ie',
	'ina': 'ia',
	'ind': 'id',
	'ipk': 'ik',
	'isl': 'is',
	'ita': 'it',
	'jav': 'jv',
	'jpn': 'ja',
	'kal': 'kl',
	'kan': 'kn',
	'kas': 'ks',
	'kat': 'ka',
	'kau': 'kr',
	'kaz': 'kk',
	'khm': 'km',
	'kik': 'ki',
	'kin': 'rw',
	'kir': 'ky',
	'kom': 'kv',
	'kon': 'kg',
	'kor': 'ko',
	'kua': 'kj',
	'kur': 'ku',
	'lao': 'lo',
	'lat': 'la',
	'lav': 'lv',
	'lim': 'li',
	'lin': 'ln',
	'lit': 'lt',
	'ltz': 'lb',
	'lub': 'lu',
	'lug': 'lg',
	'mah': 'mh',
	'mal': 'ml',
	'mar': 'mr',
	'mkd': 'mk',
	'mlg': 'mg',
	'mlt': 'mt',
	'mol': 'mo',
	'mon': 'mn',
	'mri': 'mi',
	'msa': 'ms',
	'mya': 'my',
	'nau': 'na',
	'nav': 'nv',
	'nbl': 'nr',
	'nde': 'nd',
	'ndo': 'ng',
	'nep': 'ne',
	'nld': 'nl',
	'nno': 'nn',
	'nob': 'nb',
	'nor': 'no',
	'nya': 'ny',
	'oci': 'oc',
	'oji': 'oj',
	'ori': 'or',
	'orm': 'om',
	'oss': 'os',
	'pan': 'pa',
	'pli': 'pi',
	'pol': 'pl',
	'por': 'pt',
	'pus': 'ps',
	'que': 'qu',
	'roh': 'rm',
	'ron': 'ro',
	'run': 'rn',
	'rus': 'ru',
	'sag': 'sg',
	'san': 'sa',
	'sin': 'si',
	'slk': 'sk',
	'slv': 'sl',
	'sme': 'se',
	'smo': 'sm',
	'sna': 'sn',
	'snd': 'sd',
	'som': 'so',
	'sot': 'st',
	'spa': 'es',
	'sqi': 'sq',
	'srd': 'sc',
	'srp': 'sr',
	'ssw': 'ss',
	'sun': 'su',
	'swa': 'sw',
	'swe': 'sv',
	'tah': 'ty',
	'tam': 'ta',
	'tat': 'tt',
	'tel': 'te',
	'tgk': 'tg',
	'tgl': 'tl',
	'tha': 'th',
	'tir': 'ti',
	'ton': 'to',
	'tsn': 'tn',
	'tso': 'ts',
	'tuk': 'tk',
	'tur': 'tr',
	'twi': 'tw',
	'uig': 'ug',
	'ukr': 'uk',
	'urd': 'ur',
	'uzb': 'uz',
	'ven': 've',
	'vie': 'vi',
	'vol': 'vo',
	'wln': 'wa',
	'wol': 'wo',
	'xho': 'xh',
	'yid': 'yi',
	'yor': 'yo',
	'zha': 'za',
	'zho': 'zh',
	'zul': 'zu',
}
238
class LanguageTag:
	"""A BCP 47 language tag split into its components.

	Attributes:
		subtags (List[str]): The lowercased subtags of the tag.
		grandfathered (bool): Whether the lowercased tag is listed in
			``bcp_47.grandfathered``. If ``True``, ``language`` is the
			entire lowercased tag and ``script``, ``region`` and
			``variant`` are empty strings.
		language (str): The language subtag.
		script (str): The script subtag, or ``None`` if a
			non-grandfathered tag has none.
		region (str): The region subtag, or ``None`` if a
			non-grandfathered tag has none.
		variant (str): The variant subtag, or ``None`` if a
			non-grandfathered tag has none.

	Args:
		tag (str): A BCP 47 language tag.

	"""
	def __init__ (self, tag):
		lowered = tag.lower ()
		self.subtags = lowered.split ('-')
		self.grandfathered = lowered in bcp_47.grandfathered
		if self.grandfathered:
			self.language = lowered
			self.script = self.region = self.variant = ''
			return
		self.language = self.subtags[0]
		self.script = self._find_first (self._looks_like_script, self.subtags)
		# The region search skips the first subtag, which is the language.
		self.region = self._find_first (self._looks_like_region, self.subtags[1:])
		self.variant = self._find_first (self._looks_like_variant, self.subtags)

	def __str__(self):
		return '-'.join(self.subtags)

	def __repr__ (self):
		return 'LanguageTag(%r)' % str(self)

	@staticmethod
	def _looks_like_script (s):
		# Four characters, the first of which sorts after the digits.
		return len (s) == 4 and s[0] > '9'

	@staticmethod
	def _looks_like_region (s):
		# Two characters starting after the digits, or three characters
		# starting with a character that sorts no later than '9'.
		return len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9'

	@staticmethod
	def _looks_like_variant (s):
		# More than four characters, or four characters whose first
		# character sorts no later than '9'.
		return len (s) > 4 or len (s) == 4 and s[0] <= '9'

	@staticmethod
	def _find_first (function, sequence):
		"""Return the first item of ``sequence`` for which ``function``
		is truthy, or ``None`` if there is none.
		"""
		return next (filter (function, sequence), None)

	def is_complex (self):
		"""Return whether this tag is too complex to represent as a
		``LangTag`` in the generated code.

		Complex tags need to be handled in
		``hb_ot_tags_from_complex_language``.

		Returns:
			Whether this tag is complex.
		"""
		if len (self.subtags) == 1:
			return False
		if not self.grandfathered:
			return True
		if len (self.subtags[1]) == 3:
			return True
		# A grandfathered tag is simple only if its first subtag maps to
		# the same OpenType tags as the whole tag does.
		return ot.from_bcp_47[self.subtags[0]] != ot.from_bcp_47[self.language]

	def get_group (self):
		"""Return the group into which this tag should be categorized in
		``hb_ot_tags_from_complex_language``.

		The group is the first letter of the tag, or ``'und'`` if this tag
		should not be matched in a ``switch`` statement in the generated
		code.

		Returns:
			This tag's group.
		"""
		if self.language == 'und':
			return 'und'
		# A variant with exactly one registered prefix is also put in the
		# ``'und'`` group.
		if self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1:
			return 'und'
		return self.language[0]
314
class OpenTypeRegistryParser (HTMLParser):
	"""A parser for the OpenType language system tag registry.

	Attributes:
		header (str): The "last updated" line of the registry.
		names (Mapping[str, str]): A map of language system tags to the
			names they are given in the registry.
		ranks (DefaultDict[str, int]): A map of language system tags to
			numbers. If a single BCP 47 tag corresponds to multiple
			OpenType tags, the tags are ordered in increasing order by
			rank. The rank is based on the number of BCP 47 tags
			associated with a tag, though it may be manually modified.
		to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
			OpenType language system tags to sets of BCP 47 tags.
		from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
			inverted. Its values start as unsorted sets;
			``sort_languages`` converts them to sorted lists.

	"""
	def __init__ (self):
		super ().__init__ ()
		self.header = ''
		self.names = {}
		self.ranks = collections.defaultdict (int)
		self.to_bcp_47 = collections.defaultdict (set)
		self.from_bcp_47 = collections.defaultdict (set)
		# Whether the parser is in a <td> element
		self._td = False
		# The text of the <td> elements of the current <tr> element.
		self._current_tr = []

	def handle_starttag (self, tag, attrs):
		if tag == 'meta':
			# The <meta name="updated_at"> element is kept verbatim as the
			# registry header.
			for attr, value in attrs:
				if attr == 'name' and value == 'updated_at':
					self.header = self.get_starttag_text ()
					break
		elif tag == 'td':
			self._td = True
			self._current_tr.append ('')
		elif tag == 'tr':
			self._current_tr = []

	def handle_endtag (self, tag):
		if tag == 'td':
			self._td = False
			return
		if tag != 'tr' or not self._current_tr:
			return
		cells = self._current_tr
		expect (2 <= len (cells) <= 3)
		name = cells[0].strip ()
		ot_tag = cells[1].strip ("\t\n\v\f\r '")
		rank = 0
		if len (ot_tag) > 4:
			expect (ot_tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % ot_tag)
			name += ' (deprecated)'
			ot_tag = ot_tag.split (' ')[0]
			rank = 1
		self.names[ot_tag] = re.sub (' languages$', '', name)
		# Two-cell rows are accepted by the check above, so the ISO code
		# cell may be missing entirely as well as empty.
		if len (cells) < 3 or not cells[2]:
			return
		iso_codes = cells[2].strip ()
		self.to_bcp_47[ot_tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
		rank += 2 * len (self.to_bcp_47[ot_tag])
		self.ranks[ot_tag] = rank

	def handle_data (self, data):
		# Text outside of <td> elements is ignored.
		if self._td:
			self._current_tr[-1] += data

	def handle_charref (self, name):
		self.handle_data (html.unescape ('&#%s;' % name))

	def handle_entityref (self, name):
		self.handle_data (html.unescape ('&%s;' % name))

	def parse (self, filename):
		"""Parse the OpenType language system tag registry.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If no header was found or a table row is
				ill-formed.
		"""
		with open (filename, encoding='utf-8') as f:
			self.feed (f.read ())
		expect (self.header)
		# Build the inverse map once the whole registry has been read.
		for tag, iso_codes in self.to_bcp_47.items ():
			for iso_code in iso_codes:
				self.from_bcp_47[iso_code].add (tag)

	def add_language (self, bcp_47_tag, ot_tag):
		"""Add a language as if it were in the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
				a language subtag, and if the language subtag is a
				macrolanguage, then new languages are added corresponding
				to the macrolanguages' individual languages with the
				remainder of the tag appended.
			ot_tag (str): An OpenType language system tag.
		"""
		self.to_bcp_47[ot_tag].add (bcp_47_tag)
		self.from_bcp_47[bcp_47_tag].add (ot_tag)
		if bcp_47_tag.lower () in bcp_47.grandfathered:
			return
		macrolanguage, separator, suffix = bcp_47_tag.partition ('-')
		if not separator:
			# A lone language subtag has no remainder to propagate.
			return
		if macrolanguage in bcp_47.macrolanguages:
			s = set ()
			for language in bcp_47.macrolanguages[macrolanguage]:
				if language.lower () not in bcp_47.grandfathered:
					s.add ('%s-%s' % (language, suffix))
			bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s

	@staticmethod
	def _remove_language (tag_1, dict_1, dict_2):
		# Drop ``tag_1`` from ``dict_1`` and from every set in ``dict_2``
		# that referred to it, deleting sets that become empty.
		for tag_2 in dict_1.pop (tag_1):
			dict_2[tag_2].remove (tag_1)
			if not dict_2[tag_2]:
				del dict_2[tag_2]

	def remove_language_ot (self, ot_tag):
		"""Remove an OpenType tag from the registry.

		Args:
			ot_tag (str): An OpenType tag.

		Raises:
			KeyError: If ``ot_tag`` is not in ``to_bcp_47``.
		"""
		self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

	def remove_language_bcp_47 (self, bcp_47_tag):
		"""Remove a BCP 47 tag from the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag.

		Raises:
			KeyError: If ``bcp_47_tag`` is not in ``from_bcp_47``.
		"""
		self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

	def inherit_from_macrolanguages (self):
		"""Copy mappings between macrolanguages and individual languages.

		If a macrolanguage has OpenType mappings, each of them is added
		to every one of its individual languages, and the OpenType tag's
		rank is incremented once per addition. For example, als (Tosk
		Albanian) inherits from sq (Albanian) the mapping to SQI.

		If a BCP 47 tag for a macrolanguage has no OpenType mapping but
		all of its individual languages do and they all map to the same
		tags, the mapping is copied to the macrolanguage.
		"""
		# Shallow snapshot: the set of keys is frozen, but the value sets
		# are shared with ``self.from_bcp_47``.
		original_ot_from_bcp_47 = dict (self.from_bcp_47)
		for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
			ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
			if ot_macrolanguages:
				for ot_macrolanguage in ot_macrolanguages:
					for language in languages:
						self.add_language (language, ot_macrolanguage)
						self.ranks[ot_macrolanguage] += 1
			else:
				# Intersect the mappings of the individual languages; give
				# up as soon as one has no mapping or nothing is common.
				for language in languages:
					if language in original_ot_from_bcp_47:
						if ot_macrolanguages:
							ml = original_ot_from_bcp_47[language]
							if ml:
								ot_macrolanguages &= ml
						else:
							ot_macrolanguages |= original_ot_from_bcp_47[language]
					else:
						ot_macrolanguages.clear ()
					if not ot_macrolanguages:
						break
				for ot_macrolanguage in ot_macrolanguages:
					self.add_language (macrolanguage, ot_macrolanguage)

	def sort_languages (self):
		"""Sort the values of ``from_bcp_47`` in ascending rank order."""
		for language, tags in self.from_bcp_47.items ():
			# Ties in rank are broken by the OpenType tag itself.
			self.from_bcp_47[language] = sorted (tags,
					key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
496
# Module-level OpenType registry parser. ``LanguageTag.is_complex`` and
# ``BCP47Parser._add_macrolanguage`` read it as a global.
ot = OpenTypeRegistryParser ()
498
class BCP47Parser (object):
	"""A parser for the BCP 47 subtag registry.

	Attributes:
		header (str): The "File-Date" line of the registry.
		names (Mapping[str, str]): A map of subtags to the names they
			are given in the registry. Each value is a
			``'\\n'``-separated list of names.
		scopes (Mapping[str, str]): A map of language subtags to strings
			suffixed to language names, including suffixes to explain
			language scopes.
		macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
			language subtags to the sets of language subtags which
			inherit from them. See
			``OpenTypeRegistryParser.inherit_from_macrolanguages``.
		prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
			subtags to their prefixes.
		grandfathered (AbstractSet[str]): The set of grandfathered tags,
			normalized to lowercase.

	"""
	# Trailing qualifiers that are stripped from registry descriptions.
	_DESCRIPTION_SUFFIX = re.compile (r' (\(family\)|\((individual |macro)language\)|languages)$')

	def __init__ (self):
		self.header = ''
		self.names = {}
		self.scopes = {}
		self.macrolanguages = collections.defaultdict (set)
		self.prefixes = collections.defaultdict (set)
		self.grandfathered = set ()

	@staticmethod
	def _clean_description (line):
		"""Return the name in a ``Description: `` line without its
		scope qualifiers.

		Args:
			line (str): A complete ``Description: ...`` registry line.

		Returns:
			The description with every `` (individual language)`` removed
			and one trailing `` (family)``, `` (macrolanguage)`` or
			`` languages`` stripped.
		"""
		description = line.split (' ', 1)[1].replace (' (individual language)', '')
		return BCP47Parser._DESCRIPTION_SUFFIX.sub ('', description)

	def parse (self, filename):
		"""Parse the BCP 47 subtag registry.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If the registry has no "File-Date" line.
		"""
		with open (filename, encoding='utf-8') as f:
			subtag_type = None
			subtag = None
			deprecated = False
			has_preferred_value = False
			line_buffer = ''
			# The extra empty line makes the loop process the last
			# buffered line of the file.
			for line in itertools.chain (f, ['']):
				line = line.rstrip ()
				if line.startswith (' '):
					# A folded continuation line: append it to the field
					# that is still being buffered.
					line_buffer += line[1:]
					continue
				# Process the previously buffered (now complete) line and
				# start buffering the one just read.
				line, line_buffer = line_buffer, line
				if line.startswith ('Type: '):
					subtag_type = line.split (' ')[1]
					deprecated = False
					has_preferred_value = False
				elif line.startswith (('Subtag: ', 'Tag: ')):
					subtag = line.split (' ')[1]
					if subtag_type == 'grandfathered':
						self.grandfathered.add (subtag.lower ())
				elif line.startswith ('Description: '):
					description = self._clean_description (line)
					if subtag in self.names:
						self.names[subtag] += '\n' + description
					else:
						self.names[subtag] = description
				elif subtag_type == 'language' or subtag_type == 'grandfathered':
					if line.startswith ('Scope: '):
						scope = line.split (' ')[1]
						if scope == 'macrolanguage':
							scope = ' [macrolanguage]'
						elif scope == 'collection':
							scope = ' [family]'
						else:
							continue
						self.scopes[subtag] = scope
					elif line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
						deprecated = True
					elif deprecated and line.startswith ('Comments: see '):
						# If a subtag is split into multiple replacement subtags,
						# it essentially represents a macrolanguage.
						for language in line.replace (',', '').split (' ')[2:]:
							self._add_macrolanguage (subtag, language)
					elif line.startswith ('Preferred-Value: '):
						# If a subtag is deprecated in favor of a single replacement subtag,
						# it is either a dialect or synonym of the preferred subtag. Either
						# way, it is close enough to the truth to consider the replacement
						# the macrolanguage of the deprecated language.
						has_preferred_value = True
						macrolanguage = line.split (' ')[1]
						self._add_macrolanguage (macrolanguage, subtag)
					elif not has_preferred_value and line.startswith ('Macrolanguage: '):
						self._add_macrolanguage (line.split (' ')[1], subtag)
				elif subtag_type == 'variant':
					if line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
					elif line.startswith ('Prefix: '):
						self.prefixes[subtag].add (line.split (' ')[1])
				elif line.startswith ('File-Date: '):
					self.header = line
		expect (self.header)

	def _add_macrolanguage (self, macrolanguage, language):
		# If ``language`` has no OpenType mapping, its own sublanguages
		# are attached to ``macrolanguage`` as well.
		if language not in ot.from_bcp_47:
			for sublanguage in self.macrolanguages.get (language, set ()):
				self._add_macrolanguage (macrolanguage, sublanguage)
		# If ``macrolanguage`` has no OpenType mapping and is itself listed
		# under some macrolanguage, ``language`` is added there instead.
		if macrolanguage not in ot.from_bcp_47:
			for ls in list (self.macrolanguages.values ()):
				if macrolanguage in ls:
					ls.add (language)
					return
		self.macrolanguages[macrolanguage].add (language)

	def remove_extra_macrolanguages (self):
		"""Make every language have at most one macrolanguage."""
		inverted = collections.defaultdict (list)
		for macrolanguage, languages in self.macrolanguages.items ():
			for language in languages:
				inverted[language].append (macrolanguage)
		for language, macrolanguages in inverted.items ():
			if len (macrolanguages) > 1:
				# The macrolanguage with the most languages absorbs the
				# other macrolanguages of this language.
				macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
				biggest_macrolanguage = macrolanguages.pop ()
				for macrolanguage in macrolanguages:
					self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

	def _get_name_piece (self, subtag):
		"""Return the first name of a subtag plus its scope suffix.

		Args:
			subtag (str): A BCP 47 subtag.

		Returns:
			The name form of ``subtag``.
		"""
		return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')

	def get_name (self, lt):
		"""Return the names of the subtags in a language tag.

		Args:
			lt (LanguageTag): A BCP 47 language tag.

		Returns:
			The name form of ``lt``.
		"""
		name = self._get_name_piece (lt.language)
		# ``names`` is keyed by the subtags as cased in the registry
		# lines, so the lowercased script and region are re-cased.
		if lt.script:
			name += '; ' + self._get_name_piece (lt.script.title ())
		if lt.region:
			name += '; ' + self._get_name_piece (lt.region.upper ())
		if lt.variant:
			name += '; ' + self._get_name_piece (lt.variant)
		return name
652
bcp_47 = BCP47Parser ()

# Load both registries from the files named on the command line.
ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Manual adjustments to the parsed data follow. The statements are kept in
# their original relative order, because ``add_language`` on a multi-subtag
# tag reads ``bcp_47.macrolanguages`` at the time of the call.

for bcp_47_tag, ot_tag in (
	('ary', 'MOR'),
	('ath', 'ATH'),
	('bai', 'BML'),
):
	ot.add_language (bcp_47_tag, ot_tag)

ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace the registry's mapping of PGR with the variant tag.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

ot.ranks['FNE'] = ot.ranks['TNE'] + 1

for bcp_47_tag, ot_tag in (
	('und-fonipa', 'IPPH'),
	('und-fonnapa', 'APPH'),
):
	ot.add_language (bcp_47_tag, ot_tag)

ot.remove_language_ot ('IRT')
for bcp_47_tag, ot_tag in (
	('ga-Latg', 'IRT'),
	('hy-arevmda', 'HYE'),
):
	ot.add_language (bcp_47_tag, ot_tag)

ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

for bcp_47_tag, ot_tag in (
	('no', 'NOR'),
	('oc-provenc', 'PRO'),
	('qu', 'QUZ'),
	('qub', 'QWH'),
	('qud', 'QVI'),
	('qug', 'QVI'),
	('qul', 'QUH'),
	('qup', 'QVI'),
	('qur', 'QWH'),
	('qus', 'QUH'),
	('quw', 'QVI'),
	('qux', 'QWH'),
	('qva', 'QWH'),
	('qvh', 'QWH'),
	('qvj', 'QVI'),
	('qvl', 'QWH'),
	('qvm', 'QWH'),
	('qvn', 'QWH'),
	('qvo', 'QVI'),
	('qvp', 'QWH'),
	('qvw', 'QWH'),
	('qvz', 'QVI'),
	('qwa', 'QWH'),
	('qws', 'QWH'),
	('qxa', 'QWH'),
	('qxc', 'QWH'),
	('qxh', 'QWH'),
	('qxl', 'QVI'),
	('qxn', 'QWH'),
	('qxo', 'QWH'),
	('qxr', 'QVI'),
	('qxt', 'QWH'),
	('qxw', 'QWH'),
):
	ot.add_language (bcp_47_tag, ot_tag)

# Move mo from under ro to under ro-MD.
bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

# Re-map the three Syriac tags to script-only BCP 47 tags.
for ot_tag in ('SYRE', 'SYRJ', 'SYRN'):
	ot.remove_language_ot (ot_tag)
for bcp_47_tag, ot_tag in (
	('und-Syre', 'SYRE'),
	('und-Syrj', 'SYRJ'),
	('und-Syrn', 'SYRN'),
):
	ot.add_language (bcp_47_tag, ot_tag)

bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Drop the registry's Chinese mappings and rebuild them by script and region.
for ot_tag in ('ZHH', 'ZHP', 'ZHT', 'ZHTM'):
	ot.remove_language_ot (ot_tag)
for language in ('lzh', 'yue'):
	bcp_47.macrolanguages['zh'].remove (language)
for bcp_47_tag, ot_tag in (
	('zh-Hant-MO', 'ZHH'),
	('zh-Hant-MO', 'ZHTM'),
	('zh-Hant-HK', 'ZHH'),
	('zh-Hans', 'ZHS'),
	('zh-Hant', 'ZHT'),
	('zh-HK', 'ZHH'),
	('zh-MO', 'ZHH'),
	('zh-MO', 'ZHTM'),
	('zh-TW', 'ZHT'),
	('lzh', 'ZHT'),
	('lzh-Hans', 'ZHS'),
	('yue', 'ZHH'),
	('yue-Hans', 'ZHS'),
):
	ot.add_language (bcp_47_tag, ot_tag)

bcp_47.macrolanguages['zom'] = {'yos'}
780
def rank_delta (bcp_47, ot):
	"""Return the rank adjustment for one BCP 47/OpenType tag pair.

	Most OpenType tags have a constant rank, but a few have ranks that
	depend on the BCP 47 tag they are being sorted for.

	Args:
		bcp_47 (str): A BCP 47 tag.
		ot (str): An OpenType tag.

	Returns:
		A number to add to ``ot``'s rank when sorting ``bcp_47``'s
		OpenType equivalents: ``-1`` for the pairs listed below, ``0``
		otherwise.
	"""
	promoted_pairs = (
		('ak', 'AKA'),
		('tw', 'TWI'),
	)
	return -1 if (bcp_47, ot) in promoted_pairs else 0
800
# Map of OpenType tags to the single BCP 47 tag chosen for each of them.
# NOTE(review): no use of this table is visible in this part of the file;
# presumably it feeds the generation of ``hb_ot_ambiguous_tag_to_language``
# described in the module docstring -- confirm against the rest of the file.
disambiguation = {
	'ALT': 'alt',
	'ARK': 'rki',
	'ATH': 'ath',
	'BHI': 'bhb',
	'BLN': 'bjt',
	'BTI': 'beb',
	'CCHN': 'cco',
	'CMR': 'swb',
	'CPP': 'crp',
	'CRR': 'crx',
	'DUJ': 'dwu',
	'ECR': 'crj',
	'HAL': 'cfm',
	'HND': 'hnd',
	'HYE': 'hyw',
	'KIS': 'kqs',
	'KUI': 'uki',
	'LRC': 'bqi',
	'NDB': 'nd',
	'NIS': 'njz',
	'PLG': 'pce',
	'PRO': 'pro',
	'QIN': 'bgr',
	'QUH': 'quh',
	'QVI': 'qvi',
	'QWH': 'qwh',
	'SIG': 'stv',
	'SRB': 'sr',
	'SXT': 'xnj',
	'ZHH': 'zh-HK',
	'ZHS': 'zh-Hans',
	'ZHT': 'zh-Hant',
	'ZHTM': 'zh-MO',
}
836
# Propagate mappings, collapse languages with several macrolanguages, then
# propagate again over the collapsed macrolanguages.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()

# Register the default language system with a rank above every other tag.
ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1

# A three-capital OpenType tag whose lowercase form is a known BCP 47 subtag
# without any OpenType mapping is mapped to the default language system.
for tricky_ot_tag in (name for name in ot.names if re.match ('[A-Z]{3}$', name)):
	possible_bcp_47_tag = tricky_ot_tag.lower ()
	if possible_bcp_47_tag not in bcp_47.names or ot.from_bcp_47[possible_bcp_47_tag]:
		continue
	ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
	bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
ot.sort_languages ()
848
# Emit the banner of the generated file, the include guard and the opening
# of the ``ot_languages`` array. An empty string yields an empty line.
for header_line in (
	'/* == Start of generated table == */',
	'/*',
	' * The following table is generated by running:',
	' *',
	' *   %s languagetags language-subtag-registry' % sys.argv[0],
	' *',
	' * on files with these headers:',
	' *',
	' * %s' % ot.header.strip (),
	' * %s' % bcp_47.header,
	' */',
	'',
	'#ifndef HB_OT_TAG_TABLE_HH',
	'#define HB_OT_TAG_TABLE_HH',
	'',
	'static const LangTag ot_languages[] = {',
):
	print (header_line)
865
def hb_tag (tag):
	"""Render an OpenType tag as C++ source.

	Args:
		tag (str): An OpenType tag.

	Returns:
		``HB_TAG_NONE`` followed by alignment whitespace for the default
		language system, otherwise an ``HB_TAG(...)`` expression built
		from the tag's first four characters, padded with spaces if the
		tag is shorter.
	"""
	if tag != DEFAULT_LANGUAGE_SYSTEM:
		four_characters = tag.ljust (4)[:4]
		return "HB_TAG('%s','%s','%s','%s')" % tuple (four_characters)
	return 'HB_TAG_NONE\t       '
878
def get_variant_set (name):
	"""Return a set of variant language names from a name.

	The name is split on newlines, parentheses and commas. Each piece has
	U+2019 replaced with an apostrophe, is decomposed (NFD), stripped of
	everything that is not ASCII, and trimmed of surrounding whitespace.
	Pieces that end up empty are dropped, so that unrelated names can
	never appear to share an empty variant.

	Args:
		name (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		A set of normalized language names, as ``bytes`` objects.
	"""
	variants = set ()
	for piece in re.split ('[\n(),]', name):
		normalized = (unicodedata.normalize ('NFD', piece.replace ('\u2019', "'"))
			.encode ('ASCII', 'ignore')
			.strip ())
		# Whitespace-only or entirely non-ASCII pieces normalize to b''.
		if normalized:
			variants.add (normalized)
	return variants
893
def language_name_intersection (a, b):
	"""Return the variant names that two language names have in common.

	Args:
		a (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.
		b (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		The set of normalized names, as produced by ``get_variant_set``,
		that occur in both ``a`` and ``b``.
	"""
	variants_of_a = get_variant_set (a)
	variants_of_b = get_variant_set (b)
	return variants_of_a & variants_of_b
907
def get_matching_language_name (intersection, candidates):
	"""Return the first candidate that shares a variant name with
	``intersection``.

	Args:
		intersection (AbstractSet): A set of normalized names, as produced
			by ``get_variant_set``.
		candidates (Iterable[str]): Language names to try, in order.

	Returns:
		The first item of ``candidates`` whose variant set is not
		disjoint from ``intersection``.

	Raises:
		StopIteration: If no candidate matches.
	"""
	matching = (
		candidate for candidate in candidates
		if not intersection.isdisjoint (get_variant_set (candidate))
	)
	return next (matching)
910
def same_tag (bcp_47_tag, ot_tags):
	"""Return whether a three-letter BCP 47 tag maps to exactly one
	OpenType tag which is the same tag in uppercase.

	Args:
		bcp_47_tag (str): A BCP 47 tag.
		ot_tags (Sequence[str]): The OpenType tags ``bcp_47_tag`` maps to.

	Returns:
		``True`` if the mapping is the trivial one, else ``False``.
	"""
	if len (bcp_47_tag) != 3:
		return False
	if len (ot_tags) != 1:
		return False
	return ot_tags[0].lower () == bcp_47_tag
913
# Emit one ``ot_languages`` entry per (BCP 47 tag, OpenType tag) pair,
# followed by a comment naming the language.
for language, tags in sorted (ot.from_bcp_47.items ()):
	# The empty tag and multi-subtag tags are not part of this table.
	if not language or '-' in language:
		continue
	# Trivial mappings are still listed, but commented out.
	commented_out = same_tag (language, tags)
	opening, closing = ('/*', '*/') if commented_out else ('  ', '')
	for tag in tags:
		print ('%s{\"%s\",\t%s},%s\t/* ' % (opening, language, hb_tag (tag), closing), end='')
		# The name is looked up for every tag, because the branch below
		# may overwrite ``bcp_47.names[language]``.
		bcp_47_name = bcp_47.names.get (language, '')
		bcp_47_name_candidates = bcp_47_name.split ('\n')
		ot_name = ot.names[tag]
		scope = bcp_47.scopes.get (language, '')
		if tag == DEFAULT_LANGUAGE_SYSTEM:
			comment = f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}'
		else:
			intersection = language_name_intersection (bcp_47_name, ot_name)
			if intersection:
				name = get_matching_language_name (intersection, bcp_47_name_candidates)
				bcp_47.names[language] = name
				comment = '%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)
			else:
				comment = '%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name)
		write (comment)
		print (' */')

print ('};')
print ()
941
# Emit the documentation comment, signature and opening brace of
# ``hb_ot_tags_from_complex_language``.
for prologue_line in (
	'/**',
	' * hb_ot_tags_from_complex_language:',
	' * @lang_str: a BCP 47 language tag to convert.',
	' * @limit: a pointer to the end of the substring of @lang_str to consider for',
	' * conversion.',
	' * @count: maximum number of language tags to retrieve (IN) and actual number of',
	' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.',
	' * @tags: array of size at least @language_count to store the language tag',
	' * results',
	' *',
	' * Converts a multi-subtag BCP 47 language tag to language tags.',
	' *',
	' * Return value: Whether any language systems were retrieved.',
	' **/',
	'static bool',
	'hb_ot_tags_from_complex_language (const char   *lang_str,',
	'\t\t\t\t  const char   *limit,',
	'\t\t\t\t  unsigned int *count /* IN/OUT */,',
	'\t\t\t\t  hb_tag_t     *tags /* OUT */)',
	'{',
):
	print (prologue_line)
962
def print_subtag_matches (subtag, new_line):
	"""Print a ``subtag_matches`` call for a subtag, if there is one.

	Nothing at all is printed for a falsy ``subtag``. No trailing newline
	is printed.

	Args:
		subtag (str): A subtag to test for, without the leading hyphen.
		new_line (bool): Whether to first end the current line and start
			a new one with a tab and ``&& ``.
	"""
	if not subtag:
		return
	if new_line:
		print ()
		print ('\t&& ', end='')
	print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
969
# Group the complex tags by ``LanguageTag.get_group``. Within a group, tags
# keep the order longest tag first, then alphabetical.
complex_tags = collections.defaultdict (list)
for language, tags in sorted (ot.from_bcp_47.items (),
		key=lambda i: (-len (i[0]), i[0])):
	lt = LanguageTag (language)
	if not lt.is_complex ():
		continue
	complex_tags[lt.get_group ()].append ((lt, tags))
978
# Emit the tests for the ``'und'`` group: tags that are matched by their
# script, region or variant subtags alone, outside of the ``switch``.
for lt, tags in complex_tags.get ('und', ()):
	if lt.variant in bcp_47.prefixes:
		expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
				'%s is not a valid prefix of %s' % (lt.language, lt.variant))
	print ('  if (', end='')
	# Every condition after the first goes on its own line behind ``&&``,
	# so that several subtags still form one valid C expression.
	first_condition = True
	for subtag in (lt.script, lt.region, lt.variant):
		if subtag:
			print_subtag_matches (subtag, not first_condition)
			first_condition = False
	print (')')
	print ('  {')
	write ('    /* %s */' % bcp_47.get_name (lt))
	print ()
	if len (tags) == 1:
		write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
		print ()
		print ('    *count = 1;')
	else:
		# The loop emitted below needs its index declared, exactly as the
		# multi-tag branch of the ``switch`` cases does.
		print ('    unsigned int i;')
		print ('    hb_tag_t possible_tags[] = {')
		for tag in tags:
			write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
			print ()
		print ('    };')
		print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
		print ('      tags[i] = possible_tags[i];')
		print ('    *count = i;')
	print ('    return true;')
	print ('  }')
1009
# Emit the switch on the first character of ``lang_str``, with one case
# per group other than ``und``, followed by the closing lines of the
# enclosing C function.
print ('  switch (lang_str[0])')
print ('  {')
for initial, items in sorted (complex_tags.items ()):
	if initial == 'und':
		continue
	print ("  case '%s':" % initial)
	for lt, tags in items:
		# Subtags folded into the leading comparison are cleared so that
		# they are not tested a second time below.
		pending_script = lt.script
		pending_region = lt.region
		if lt.grandfathered:
			# Compare everything after the first character exactly.
			condition = '0 == strcmp (&lang_str[1], "%s")' % lt.language[1:]
		else:
			literal = lt.language[1:] + '-'
			if pending_script:
				literal += pending_script
				pending_script = None
				if pending_region:
					literal += '-' + pending_region
					pending_region = None
			if literal.endswith ('-'):
				# No script was appended: compare the language and its
				# hyphen as a prefix.
				condition = '0 == strncmp (&lang_str[1], "%s", %i)' % (literal, len (literal))
			else:
				condition = 'lang_matches (&lang_str[1], "%s")' % literal
		print ('    if (' + condition, end='')
		# Whatever was not folded in above is tested separately, each
		# test on its own ``&&`` continuation line.
		for subtag in (pending_script, pending_region, lt.variant):
			print_subtag_matches (subtag, True)
		print (')')
		print ('    {')
		write ('      /* %s */' % bcp_47.get_name (lt))
		print ()
		if len (tags) != 1:
			print ('      unsigned int i;')
			print ('      hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('      };')
			print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('\ttags[i] = possible_tags[i];')
			print ('      *count = i;')
		else:
			only_tag = tags[0]
			write ('      tags[0] = %s;  /* %s */' % (hb_tag (only_tag), ot.names[only_tag]))
			print ()
			print ('      *count = 1;')
		print ('      return true;')
		print ('    }')
	print ('    break;')

print ('  }')
print ('  return false;')
print ('}')
# Emit a blank separator line, then the documentation comment, the
# signature, and the opening of the switch of
# ``hb_ot_ambiguous_tag_to_language``.
print ()
print ('\n'.join ([
	'/**',
	' * hb_ot_ambiguous_tag_to_language',
	' * @tag: A language tag.',
	' *',
	' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
	' * many language tags) and the best tag is not the alphabetically first, or if',
	' * the best tag consists of multiple subtags, or if the best tag does not appear',
	' * in #ot_languages.',
	' *',
	' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
	' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
	' **/',
	'static hb_language_t',
	'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
	'{',
	'  switch (tag)',
	'  {',
]))
1080
def verify_disambiguation_dict ():
	"""Check ``disambiguation`` against the parsed data and normalize it.

	``disambiguation`` maps ambiguous OpenType language system tags to
	the BCP 47 tag each one should be converted to. For every OpenType
	tag, this function makes sure that an entry exists only where one
	is needed, that a manually chosen value is one of the tag's BCP 47
	tags, and that any ambiguity it cannot settle by itself has a
	manual entry.

	As a side effect, entries are added for tags whose disambiguation
	can be determined automatically, and entries that merely repeat
	what the fallback would return anyway are removed.

	Raises:
		AssertionError: Verification failed.
	"""
	for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
		if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
			primary_tags = []
		else:
			# A BCP 47 tag is primary for ``ot_tag`` when it is not
			# grandfathered and ``ot_tag`` is the first of its OpenType
			# tags.
			primary_tags = [
				t for t in bcp_47_tags
				if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag
			]

		if not primary_tags:
			expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
			continue

		if len (primary_tags) == 1:
			only_primary = primary_tags[0]
			expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
			if '-' in only_primary:
				# A tag made of several subtags always gets an entry.
				disambiguation[ot_tag] = only_primary
			else:
				# A simple tag needs an entry only when it is not the
				# alphabetically first candidate.
				first_tag = sorted (
					t for t in bcp_47_tags
					if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t)
				)[0]
				if only_primary != first_tag:
					disambiguation[ot_tag] = only_primary
			continue

		# Several primary tags: try to narrow them down to one, preferring
		# a lone macrolanguage, then a lone family, then a lone tag that
		# is not a retired code.
		candidates = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
		if len (candidates) != 1:
			candidates = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]']
		if len (candidates) != 1:
			candidates = [t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')]
		if len (candidates) == 1:
			# A manual entry, when present, takes precedence.
			if ot_tag not in disambiguation:
				disambiguation[ot_tag] = candidates[0]
		else:
			expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (candidates)))
			expect (disambiguation[ot_tag] in bcp_47_tags,
					'%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))

		# Drop the entry when it is a simple tag equal to the first of the
		# tags for which ``same_tag`` is false; presumably that is the tag
		# the fallback would return anyway.
		different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
		if different_bcp_47_tags:
			chosen = disambiguation[ot_tag]
			if chosen == different_bcp_47_tags[0] and '-' not in chosen:
				del disambiguation[ot_tag]

	# Every remaining key must be a known OpenType tag.
	for ot_tag in disambiguation:
		expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1133
# Check and normalize ``disambiguation``, then emit one ``case`` per
# remaining entry, ordered by OpenType tag.
verify_disambiguation_dict ()
for ot_tag in sorted (disambiguation):
	bcp_47_tag = disambiguation[ot_tag]
	write ('  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
	print ()
	language_name = bcp_47.get_name (LanguageTag (bcp_47_tag))
	write ('    return hb_language_from_string ("%s", -1);  /* %s */' % (bcp_47_tag, language_name))
	print ()

# Close the switch and the function, then emit the end of the header.
print ('\n'.join ([
	'  default:',
	'    return HB_LANGUAGE_INVALID;',
	'  }',
	'}',
	'',
	'#endif /* HB_OT_TAG_TABLE_HH */',
	'',
	'/* == End of generated table == */',
]))
1150
1151