• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2
3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
4versa.
5
6It creates a ``const LangTag[]``, matching the tags from the OpenType
7languages system tag list to the language subtags of the BCP 47 language
8subtag registry, with some manual adjustments. The mappings are
9supplemented with macrolanguages' sublanguages and retired codes'
10replacements, according to BCP 47 and some manual additions where BCP 47
11omits a retired code entirely.
12
13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16multiple BCP 47 tags) are listed here, except when the alphabetically
17first BCP 47 tag happens to be the chosen disambiguated tag. In that
18case, the fallback behavior will choose the right tag anyway.
19
20usage: ./gen-tag-table.py languagetags language-subtag-registry
21
22Input files:
23* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
24* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
25"""
26
27import collections
28import html
29from html.parser import HTMLParser
30import itertools
31import re
32import sys
33import unicodedata
34
# The script takes exactly two arguments (the two input files). With any
# other argument count, exit using the module docstring as the message.
if len (sys.argv) != 3:
	sys.exit (__doc__)
37
def expect (condition, message=None):
	"""Raise ``AssertionError`` unless ``condition`` is truthy.

	Args:
		condition: The value that is expected to be truthy.
		message (Optional[str]): The error message. If ``None``, the
			error is raised without any message.

	Raises:
		AssertionError: If ``condition`` is falsy.
	"""
	if condition:
		return
	raise (AssertionError () if message is None else AssertionError (message))
43
def write (s):
	"""Write a string to standard output as UTF-8 bytes.

	The text layer of ``sys.stdout`` is flushed first so that anything
	already printed is emitted before the bytes written directly to the
	underlying binary buffer.

	Args:
		s (str): The text to write.
	"""
	stream = sys.stdout
	stream.flush ()
	encoded = s.encode ('utf-8')
	stream.buffer.write (encoded)
47
# The pseudo OpenType tag standing for the default language system. It is
# the empty string; ``hb_tag`` renders it as ``HB_TAG_NONE``.
DEFAULT_LANGUAGE_SYSTEM = ''
49
# from https://www-01.sil.org/iso639-3/iso-639-3.tab
#
# A map of three-letter codes to their two-letter equivalents.
# ``OpenTypeRegistryParser.handle_endtag`` uses it to translate the ISO
# codes listed in the OpenType registry; a code with no entry here is
# kept unchanged.
ISO_639_3_TO_1 = {
	'aar': 'aa',
	'abk': 'ab',
	'afr': 'af',
	'aka': 'ak',
	'amh': 'am',
	'ara': 'ar',
	'arg': 'an',
	'asm': 'as',
	'ava': 'av',
	'ave': 'ae',
	'aym': 'ay',
	'aze': 'az',
	'bak': 'ba',
	'bam': 'bm',
	'bel': 'be',
	'ben': 'bn',
	'bis': 'bi',
	'bod': 'bo',
	'bos': 'bs',
	'bre': 'br',
	'bul': 'bg',
	'cat': 'ca',
	'ces': 'cs',
	'cha': 'ch',
	'che': 'ce',
	'chu': 'cu',
	'chv': 'cv',
	'cor': 'kw',
	'cos': 'co',
	'cre': 'cr',
	'cym': 'cy',
	'dan': 'da',
	'deu': 'de',
	'div': 'dv',
	'dzo': 'dz',
	'ell': 'el',
	'eng': 'en',
	'epo': 'eo',
	'est': 'et',
	'eus': 'eu',
	'ewe': 'ee',
	'fao': 'fo',
	'fas': 'fa',
	'fij': 'fj',
	'fin': 'fi',
	'fra': 'fr',
	'fry': 'fy',
	'ful': 'ff',
	'gla': 'gd',
	'gle': 'ga',
	'glg': 'gl',
	'glv': 'gv',
	'grn': 'gn',
	'guj': 'gu',
	'hat': 'ht',
	'hau': 'ha',
	'hbs': 'sh',
	'heb': 'he',
	'her': 'hz',
	'hin': 'hi',
	'hmo': 'ho',
	'hrv': 'hr',
	'hun': 'hu',
	'hye': 'hy',
	'ibo': 'ig',
	'ido': 'io',
	'iii': 'ii',
	'iku': 'iu',
	'ile': 'ie',
	'ina': 'ia',
	'ind': 'id',
	'ipk': 'ik',
	'isl': 'is',
	'ita': 'it',
	'jav': 'jv',
	'jpn': 'ja',
	'kal': 'kl',
	'kan': 'kn',
	'kas': 'ks',
	'kat': 'ka',
	'kau': 'kr',
	'kaz': 'kk',
	'khm': 'km',
	'kik': 'ki',
	'kin': 'rw',
	'kir': 'ky',
	'kom': 'kv',
	'kon': 'kg',
	'kor': 'ko',
	'kua': 'kj',
	'kur': 'ku',
	'lao': 'lo',
	'lat': 'la',
	'lav': 'lv',
	'lim': 'li',
	'lin': 'ln',
	'lit': 'lt',
	'ltz': 'lb',
	'lub': 'lu',
	'lug': 'lg',
	'mah': 'mh',
	'mal': 'ml',
	'mar': 'mr',
	'mkd': 'mk',
	'mlg': 'mg',
	'mlt': 'mt',
	'mol': 'mo',
	'mon': 'mn',
	'mri': 'mi',
	'msa': 'ms',
	'mya': 'my',
	'nau': 'na',
	'nav': 'nv',
	'nbl': 'nr',
	'nde': 'nd',
	'ndo': 'ng',
	'nep': 'ne',
	'nld': 'nl',
	'nno': 'nn',
	'nob': 'nb',
	'nor': 'no',
	'nya': 'ny',
	'oci': 'oc',
	'oji': 'oj',
	'ori': 'or',
	'orm': 'om',
	'oss': 'os',
	'pan': 'pa',
	'pli': 'pi',
	'pol': 'pl',
	'por': 'pt',
	'pus': 'ps',
	'que': 'qu',
	'roh': 'rm',
	'ron': 'ro',
	'run': 'rn',
	'rus': 'ru',
	'sag': 'sg',
	'san': 'sa',
	'sin': 'si',
	'slk': 'sk',
	'slv': 'sl',
	'sme': 'se',
	'smo': 'sm',
	'sna': 'sn',
	'snd': 'sd',
	'som': 'so',
	'sot': 'st',
	'spa': 'es',
	'sqi': 'sq',
	'srd': 'sc',
	'srp': 'sr',
	'ssw': 'ss',
	'sun': 'su',
	'swa': 'sw',
	'swe': 'sv',
	'tah': 'ty',
	'tam': 'ta',
	'tat': 'tt',
	'tel': 'te',
	'tgk': 'tg',
	'tgl': 'tl',
	'tha': 'th',
	'tir': 'ti',
	'ton': 'to',
	'tsn': 'tn',
	'tso': 'ts',
	'tuk': 'tk',
	'tur': 'tr',
	'twi': 'tw',
	'uig': 'ug',
	'ukr': 'uk',
	'urd': 'ur',
	'uzb': 'uz',
	'ven': 've',
	'vie': 'vi',
	'vol': 'vo',
	'wln': 'wa',
	'wol': 'wo',
	'xho': 'xh',
	'yid': 'yi',
	'yor': 'yo',
	'zha': 'za',
	'zho': 'zh',
	'zul': 'zu',
}
238
class LanguageTag (object):
	"""A BCP 47 language tag, lowercased and split into its subtags.

	Attributes:
		subtags (List[str]): The lowercased subtags of this tag, in order.
		grandfathered (bool): Whether the lowercased tag is listed in
			``bcp_47.grandfathered``. If ``True``, the entire lowercased
			tag is the ``language`` and the other subtag fields are
			empty strings.
		language (str): The language subtag.
		script (Optional[str]): The script subtag, or ``None`` if no
			subtag looks like a script.
		region (Optional[str]): The region subtag, or ``None`` if no
			subtag looks like a region.
		variant (Optional[str]): The variant subtag, or ``None`` if no
			subtag looks like a variant.

	Args:
		tag (str): A BCP 47 language tag.

	"""
	def __init__ (self, tag):
		lowercased = tag.lower ()
		self.subtags = lowercased.split ('-')
		self.grandfathered = lowercased in bcp_47.grandfathered
		if self.grandfathered:
			self.language = lowercased
			self.script = ''
			self.region = ''
			self.variant = ''
			return
		self.language = self.subtags[0]
		# The script and variant searches include the first subtag; the
		# region search starts after it.
		self.script = self._find_first (self._looks_like_script, self.subtags)
		self.region = self._find_first (self._looks_like_region, self.subtags[1:])
		self.variant = self._find_first (self._looks_like_variant, self.subtags)

	def __str__(self):
		return '-'.join (self.subtags)

	def __repr__ (self):
		return 'LanguageTag(%r)' % (str (self),)

	@staticmethod
	def _looks_like_script (subtag):
		"""Return whether a subtag has four characters and does not start with a digit."""
		return len (subtag) == 4 and subtag[0] > '9'

	@staticmethod
	def _looks_like_region (subtag):
		"""Return whether a subtag is two non-digit-initial characters or three digit-initial ones."""
		if len (subtag) == 2 and subtag[0] > '9':
			return True
		return len (subtag) == 3 and subtag[0] <= '9'

	@staticmethod
	def _looks_like_variant (subtag):
		"""Return whether a subtag is longer than four characters or is four digit-initial ones."""
		if len (subtag) > 4:
			return True
		return len (subtag) == 4 and subtag[0] <= '9'

	@staticmethod
	def _find_first (function, sequence):
		"""Return the first item of ``sequence`` for which ``function``
		returns a truthy value, or ``None`` if there is none.
		"""
		return next (filter (function, sequence), None)

	def is_complex (self):
		"""Return whether this tag is too complex to represent as a
		``LangTag`` in the generated code.

		Complex tags need to be handled in
		``hb_ot_tags_from_complex_language``.

		Returns:
			Whether this tag is complex.
		"""
		if len (self.subtags) == 1:
			return False
		if not self.grandfathered:
			return True
		if len (self.subtags[1]) == 3:
			return True
		# A grandfathered tag is simple only if its first subtag alone
		# maps to the same OpenType tags as the whole tag.
		return ot.from_bcp_47[self.subtags[0]] != ot.from_bcp_47[self.language]

	def get_group (self):
		"""Return the group into which this tag should be categorized in
		``hb_ot_tags_from_complex_language``.

		The group is the first letter of the tag, or ``'und'`` if this tag
		should not be matched in a ``switch`` statement in the generated
		code.

		Returns:
			This tag's group.
		"""
		if self.language == 'und':
			return 'und'
		# A variant registered with exactly one prefix is also grouped
		# under 'und'.
		if self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1:
			return 'und'
		return self.language[0]
314
class OpenTypeRegistryParser (HTMLParser):
	"""A parser for the OpenType language system tag registry.

	Attributes:
		header (str): The "last updated" line of the registry.
		names (Mapping[str, str]): A map of language system tags to the
			names they are given in the registry.
		ranks (DefaultDict[str, int]): A map of language system tags to
			numbers. If a single BCP 47 tag corresponds to multiple
			OpenType tags, the tags are ordered in increasing order by
			rank. The rank is based on the number of BCP 47 tags
			associated with a tag, though it may be manually modified.
		to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
			OpenType language system tags to sets of BCP 47 tags.
		from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
			inverted. Its values start as unsorted sets;
			``sort_languages`` converts them to sorted lists.
		from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
			A copy of ``from_bcp_47``. It starts as ``None`` and is
			populated at the beginning of the first call to
			``inherit_from_macrolanguages``.

	"""
	def __init__ (self):
		HTMLParser.__init__ (self)
		self.header = ''
		self.names = {}
		self.ranks = collections.defaultdict (int)
		self.to_bcp_47 = collections.defaultdict (set)
		self.from_bcp_47 = collections.defaultdict (set)
		self.from_bcp_47_uninherited = None
		# Whether the parser is in a <td> element
		self._td = False
		# Whether the parser is after a <br> element within the current <tr> element
		self._br = False
		# The text of the <td> elements of the current <tr> element.
		self._current_tr = []

	def handle_starttag (self, tag, attrs):
		if tag == 'br':
			self._br = True
		elif tag == 'meta':
			# The <meta name="updated_at"> tag, verbatim, serves as the header.
			for attr, value in attrs:
				if attr == 'name' and value == 'updated_at':
					self.header = self.get_starttag_text ()
					break
		elif tag == 'td':
			self._td = True
			self._current_tr.append ('')
		elif tag == 'tr':
			self._br = False
			self._current_tr = []

	def handle_endtag (self, tag):
		if tag == 'td':
			self._td = False
		elif tag == 'tr' and self._current_tr:
			# A data row: name, OpenType tag, and optionally ISO codes.
			expect (2 <= len (self._current_tr) <= 3)
			name = self._current_tr[0].strip ()
			ot_tag = self._current_tr[1].strip ("\t\n\v\f\r '")
			rank = 0
			if len (ot_tag) > 4:
				expect (ot_tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % ot_tag)
				name += ' (deprecated)'
				ot_tag = ot_tag.split (' ')[0]
				rank = 1
			self.names[ot_tag] = re.sub (' languages$', '', name)
			# Rows with only two cells are accepted above, so the ISO code
			# cell must not be indexed unconditionally.
			if len (self._current_tr) < 3 or not self._current_tr[2]:
				return
			iso_codes = self._current_tr[2].strip ()
			self.to_bcp_47[ot_tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
			rank += 2 * len (self.to_bcp_47[ot_tag])
			self.ranks[ot_tag] = rank

	def handle_data (self, data):
		# Only text inside a <td> and before the row's first <br> is kept.
		if self._td and not self._br:
			self._current_tr[-1] += data

	def handle_charref (self, name):
		self.handle_data (html.unescape ('&#%s;' % name))

	def handle_entityref (self, name):
		self.handle_data (html.unescape ('&%s;' % name))

	def parse (self, filename):
		"""Parse the OpenType language system tag registry.

		After feeding the file to the HTML parser, ``from_bcp_47`` is
		built by inverting ``to_bcp_47``.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If no header was found in the file.
		"""
		with open (filename, encoding='utf-8') as f:
			self.feed (f.read ())
		expect (self.header)
		for tag, iso_codes in self.to_bcp_47.items ():
			for iso_code in iso_codes:
				self.from_bcp_47[iso_code].add (tag)

	def add_language (self, bcp_47_tag, ot_tag):
		"""Add a language as if it were in the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
				a language subtag, and if the language subtag is a
				macrolanguage, then new languages are added corresponding
				to the macrolanguages' individual languages with the
				remainder of the tag appended.
			ot_tag (str): An OpenType language system tag.
		"""
		self.to_bcp_47[ot_tag].add (bcp_47_tag)
		self.from_bcp_47[bcp_47_tag].add (ot_tag)
		if bcp_47_tag.lower () in bcp_47.grandfathered:
			return
		macrolanguage, separator, suffix = bcp_47_tag.partition ('-')
		if not separator:
			# A bare language subtag has no suffix to propagate.
			return
		if macrolanguage in bcp_47.macrolanguages:
			s = set ()
			for language in bcp_47.macrolanguages[macrolanguage]:
				if language.lower () not in bcp_47.grandfathered:
					s.add ('%s-%s' % (language, suffix))
			bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s

	@staticmethod
	def _remove_language (tag_1, dict_1, dict_2):
		"""Remove ``tag_1`` from ``dict_1`` and from every value set of
		``dict_2`` it was linked to, dropping keys of ``dict_2`` whose
		sets become empty.

		Raises:
			KeyError: If ``tag_1`` is not a key of ``dict_1``.
		"""
		for tag_2 in dict_1.pop (tag_1):
			dict_2[tag_2].remove (tag_1)
			if not dict_2[tag_2]:
				del dict_2[tag_2]

	def remove_language_ot (self, ot_tag):
		"""Remove an OpenType tag from the registry.

		Args:
			ot_tag (str): An OpenType tag.
		"""
		self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

	def remove_language_bcp_47 (self, bcp_47_tag):
		"""Remove a BCP 47 tag from the registry.

		Args:
			bcp_47_tag (str): A BCP 47 tag.
		"""
		self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

	def inherit_from_macrolanguages (self):
		"""Copy mappings from macrolanguages to individual languages.

		If a BCP 47 tag for an individual mapping has no OpenType
		mapping but its macrolanguage does, the mapping is copied to
		the individual language. For example, als (Tosk Albanian) has no
		explicit mapping, so it inherits from sq (Albanian) the mapping
		to SQI.

		However, if an OpenType tag maps to a BCP 47 macrolanguage and
		some but not all of its individual languages, the mapping is not
		inherited from the macrolanguage to the missing individual
		languages. For example, INUK (Nunavik Inuktitut) is mapped to
		ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
		ikt (Inuinnaqtun, which is an individual language of iu), so
		this method does not add a mapping from ikt to INUK.

		If a BCP 47 tag for a macrolanguage has no OpenType mapping but
		some of its individual languages do, their mappings are copied
		to the macrolanguage.
		"""
		first_time = self.from_bcp_47_uninherited is None
		if first_time:
			# NOTE: this is a shallow copy; the value sets are shared with
			# ``from_bcp_47``.
			self.from_bcp_47_uninherited = dict (self.from_bcp_47)
		# Iterate over a snapshot: ``add_language`` may add new keys to
		# ``bcp_47.macrolanguages``.
		for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
			ot_macrolanguages = {
				ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
			}
			blocked_ot_macrolanguages = set ()
			if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
				for ot_macrolanguage in ot_macrolanguages:
					round_trip_macrolanguages = {
						l for l in self.to_bcp_47[ot_macrolanguage]
						if 'retired code' not in bcp_47.scopes.get (l, '')
					}
					round_trip_languages = {
						l for l in languages
						if 'retired code' not in bcp_47.scopes.get (l, '')
					}
					# An OpenType tag that already maps to some, but not
					# all, of the non-retired individual languages is not
					# inherited by the rest.
					intersection = round_trip_macrolanguages & round_trip_languages
					if intersection and intersection != round_trip_languages:
						blocked_ot_macrolanguages.add (ot_macrolanguage)
			if ot_macrolanguages:
				for ot_macrolanguage in ot_macrolanguages:
					if ot_macrolanguage not in blocked_ot_macrolanguages:
						for language in languages:
							self.add_language (language, ot_macrolanguage)
							if not blocked_ot_macrolanguages:
								self.ranks[ot_macrolanguage] += 1
			elif first_time:
				# The macrolanguage has no mapping of its own: collect the
				# mappings of its individual languages, giving up as soon
				# as one of them has none.
				for language in languages:
					if language in self.from_bcp_47_uninherited:
						ot_macrolanguages |= self.from_bcp_47_uninherited[language]
					else:
						ot_macrolanguages.clear ()
					if not ot_macrolanguages:
						break
				for ot_macrolanguage in ot_macrolanguages:
					self.add_language (macrolanguage, ot_macrolanguage)

	def sort_languages (self):
		"""Sort the values of ``from_bcp_47`` in ascending rank order.

		Each value is replaced by a list sorted by rank (adjusted by
		``rank_delta``), with ties broken by the OpenType tag itself.
		"""
		for language, tags in self.from_bcp_47.items ():
			self.from_bcp_47[language] = sorted (tags,
					key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
527
# The single registry parser instance for the OpenType side. It is created
# before ``BCP47Parser`` is used because ``BCP47Parser._add_macrolanguage``
# reads it as a global.
ot = OpenTypeRegistryParser ()
529
class BCP47Parser (object):
	"""A parser for the BCP 47 subtag registry.

	Attributes:
		header (str): The "File-Date" line of the registry.
		names (Mapping[str, str]): A map of subtags to the names they
			are given in the registry. Each value is a
			``'\\n'``-separated list of names.
		scopes (Mapping[str, str]): A map of language subtags to strings
			suffixed to language names, including suffixes to explain
			language scopes.
		macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
			language subtags to the sets of language subtags which
			inherit from them. See
			``OpenTypeRegistryParser.inherit_from_macrolanguages``.
		prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
			subtags to their prefixes.
		grandfathered (AbstractSet[str]): The set of grandfathered tags,
			normalized to lowercase.

	"""
	def __init__ (self):
		self.header = ''
		self.names = {}
		self.scopes = {}
		self.macrolanguages = collections.defaultdict (set)
		self.prefixes = collections.defaultdict (set)
		self.grandfathered = set ()

	def parse (self, filename):
		"""Parse the BCP 47 subtag registry.

		Args:
			filename (str): The file name of the registry.

		Raises:
			AssertionError: If the file has no "File-Date" line.
		"""
		with open (filename, encoding='utf-8') as f:
			subtag_type = None
			subtag = None
			deprecated = False
			has_preferred_value = False
			line_buffer = ''
			# Each physical line is buffered so that continuation lines
			# (which start with a space) can be appended to it; a line is
			# processed only once the next non-continuation line is read.
			# The extra '' at the end flushes the last buffered line.
			for line in itertools.chain (f, ['']):
				line = line.rstrip ()
				if line.startswith (' '):
					line_buffer += line[1:]
					continue
				line, line_buffer = line_buffer, line
				if line.startswith ('Type: '):
					# A new record starts: reset the per-record state.
					subtag_type = line.split (' ')[1]
					deprecated = False
					has_preferred_value = False
				elif line.startswith (('Subtag: ', 'Tag: ')):
					subtag = line.split (' ')[1]
					if subtag_type == 'grandfathered':
						self.grandfathered.add (subtag.lower ())
				elif line.startswith ('Description: '):
					description = line.split (' ', 1)[1].replace (' (individual language)', '')
					# A raw string is required here: ``\(`` is not a valid
					# escape sequence in an ordinary string literal.
					description = re.sub (r' (\(family\)|\((individual |macro)language\)|languages)$', '',
							description)
					if subtag in self.names:
						self.names[subtag] += '\n' + description
					else:
						self.names[subtag] = description
				elif subtag_type in ('language', 'grandfathered'):
					if line.startswith ('Scope: '):
						scope = line.split (' ')[1]
						# Other scopes get no suffix.
						if scope in ('macrolanguage', 'collection'):
							self.scopes[subtag] = ' [%s]' % scope
					elif line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
						deprecated = True
					elif deprecated and line.startswith ('Comments: see '):
						# If a subtag is split into multiple replacement subtags,
						# it essentially represents a macrolanguage.
						for language in line.replace (',', '').split (' ')[2:]:
							self._add_macrolanguage (subtag, language)
					elif line.startswith ('Preferred-Value: '):
						# If a subtag is deprecated in favor of a single replacement subtag,
						# it is either a dialect or synonym of the preferred subtag. Either
						# way, it is close enough to the truth to consider the replacement
						# the macrolanguage of the deprecated language.
						has_preferred_value = True
						macrolanguage = line.split (' ')[1]
						self._add_macrolanguage (macrolanguage, subtag)
					elif not has_preferred_value and line.startswith ('Macrolanguage: '):
						self._add_macrolanguage (line.split (' ')[1], subtag)
				elif subtag_type == 'variant':
					if line.startswith ('Deprecated: '):
						self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
					elif line.startswith ('Prefix: '):
						self.prefixes[subtag].add (line.split (' ')[1])
				elif line.startswith ('File-Date: '):
					self.header = line
		expect (self.header)

	def _add_macrolanguage (self, macrolanguage, language):
		"""Record that ``language`` inherits from ``macrolanguage``.

		If ``language`` has no OpenType mapping, the languages that
		inherit from it are first attached to ``macrolanguage`` too. If
		``macrolanguage`` has no OpenType mapping and already inherits
		from another entry, ``language`` is attached to that entry
		instead.
		"""
		global ot
		if language not in ot.from_bcp_47:
			for l in self.macrolanguages.get (language, set ()):
				self._add_macrolanguage (macrolanguage, l)
		if macrolanguage not in ot.from_bcp_47:
			for ls in list (self.macrolanguages.values ()):
				if macrolanguage in ls:
					ls.add (language)
					return
		self.macrolanguages[macrolanguage].add (language)

	def remove_extra_macrolanguages (self):
		"""Make every language have at most one macrolanguage."""
		inverted = collections.defaultdict (list)
		for macrolanguage, languages in self.macrolanguages.items ():
			for language in languages:
				inverted[language].append (macrolanguage)
		for language, macrolanguages in inverted.items ():
			if len (macrolanguages) > 1:
				# Attach the smaller macrolanguages to the one with the
				# most languages.
				macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
				biggest_macrolanguage = macrolanguages.pop ()
				for macrolanguage in macrolanguages:
					self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

	def _get_name_piece (self, subtag):
		"""Return the first name of a subtag plus its scope suffix.

		Args:
			subtag (str): A BCP 47 subtag.

		Returns:
			The name form of ``subtag``.
		"""
		return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')

	def get_name (self, lt):
		"""Return the names of the subtags in a language tag.

		The pieces for the language, script, region and variant (those
		that are present) are joined with ``'; '``.

		Args:
			lt (LanguageTag): A BCP 47 language tag.

		Returns:
			The name form of ``lt``.
		"""
		name = self._get_name_piece (lt.language)
		# Names are keyed by subtags in their registry casing: title case
		# for scripts and upper case for regions.
		if lt.script:
			name += '; ' + self._get_name_piece (lt.script.title ())
		if lt.region:
			name += '; ' + self._get_name_piece (lt.region.upper ())
		if lt.variant:
			name += '; ' + self._get_name_piece (lt.variant)
		return name
683
# The single parser instance for the BCP 47 side, followed by the parsing
# of both input files named on the command line.
bcp_47 = BCP47Parser ()

ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Everything from here to ``rank_delta`` manually adjusts the parsed
# registries: extra mappings, replaced mappings, rank tweaks, and extra
# names, scopes and inheritance relations. The statements are order
# dependent (e.g. a mapping is removed before its replacement is added).

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

# Rank BAL just after KAR.
ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace the registry's mappings for PGR with el-polyton.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

# Register flm as a retired code from which cfm inherits.
bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

# Rank FNE just after TNE.
ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

# Replace the registry's mappings for IRT with ga-Latg.
ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.add_language ('hy-arevmda', 'HYE')

# Replace the registry's mappings for KGE with und-Geok.
ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

# Add KHN as a lower-priority alternative to KHT, named after it.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

# Rank LCR just after MCR.
ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

# Register mhv as a retired code.
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('mnw-TH', 'MONT')

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Replace the registry's mappings for QUZ with qu, then add explicit
# mappings from individual q* subtags to QWH, QVI and QUH.
ot.remove_language_ot ('QUZ')
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qul', 'QUH')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# Make mo inherit from ro-MD (``macrolanguages`` is a defaultdict, so the
# 'ro-MD' entry is created here if it does not exist yet).
bcp_47.macrolanguages['ro-MD'].add ('mo')

# Replace the registry's mappings for the SYR* tags with script-only tags.
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

# Register xst as a retired code from which stv and wle inherit.
bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Replace the registry's mappings for the ZH* tags with explicit
# script/region-qualified tags. lzh and yue stop inheriting from zh and
# get explicit mappings of their own. ``remove`` raises KeyError if either
# is not currently in the 'zh' set.
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
ot.remove_language_ot ('ZHTM')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-MO', 'ZHTM')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-MO', 'ZHTM')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
813
def rank_delta (bcp_47, ot):
	"""Return a delta to apply to a BCP 47 tag's rank.

	Most OpenType tags have a constant rank, but a few have ranks that
	depend on the BCP 47 tag.

	Args:
		bcp_47 (str): A BCP 47 tag.
		ot (str): An OpenType tag.

	Returns:
		A number to add to ``ot``'s rank when sorting ``bcp_47``'s
		OpenType equivalents: -1 for the pairs listed below, 0 otherwise.
	"""
	# The (BCP 47 tag, OpenType tag) pairs whose rank is lowered by one.
	promoted_pairs = (('ak', 'AKA'), ('tw', 'TWI'))
	return -1 if (bcp_47, ot) in promoted_pairs else 0
833
# A map of OpenType tags to the single BCP 47 tag chosen for each.
# NOTE(review): the consumer of this table is not visible in this part of
# the file; per the module docstring it is presumably used when generating
# ``hb_ot_ambiguous_tag_to_language`` - confirm.
disambiguation = {
	'ALT': 'alt',
	'ARK': 'rki',
	'ATH': 'ath',
	'BHI': 'bhb',
	'BLN': 'bjt',
	'BTI': 'beb',
	'CCHN': 'cco',
	'CMR': 'swb',
	'CPP': 'crp',
	'CRR': 'crx',
	'DUJ': 'dwu',
	'ECR': 'crj',
	'HAL': 'cfm',
	'HND': 'hnd',
	'HYE': 'hyw',
	'KIS': 'kqs',
	'KUI': 'uki',
	'LRC': 'bqi',
	'NDB': 'nd',
	'NIS': 'njz',
	'PLG': 'pce',
	'PRO': 'pro',
	'QIN': 'bgr',
	'QUH': 'quh',
	'QVI': 'qvi',
	'QWH': 'qwh',
	'SIG': 'stv',
	'SRB': 'sr',
	'SXT': 'xnj',
	'ZHH': 'zh-HK',
	'ZHS': 'zh-Hans',
	'ZHT': 'zh-Hant',
	'ZHTM': 'zh-MO',
}
869
# Propagate mappings between macrolanguages and individual languages. The
# inheritance is run once, then again after every language has been
# reduced to at most one macrolanguage.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
# Register the default language system with a rank higher than every other
# tag's, so that it sorts last.
ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
# For every OpenType tag made of exactly three uppercase letters whose
# lowercase form is a named BCP 47 subtag without any OpenType mapping, map
# that BCP 47 subtag to the default language system and clear its set of
# inheriting languages. (The ``from_bcp_47`` lookup creates an empty entry
# as a side effect, since it is a defaultdict.)
for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
	possible_bcp_47_tag = tricky_ot_tag.lower ()
	if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
		ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
		bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
# From here on the values of ``ot.from_bcp_47`` are rank-sorted lists.
ot.sort_languages ()
881
# Emit the top of the generated file: a banner comment recording the
# command line and the headers of the two input files, followed by the
# opening of the include guard.
print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
897
def hb_tag (tag):
	"""Convert a tag to ``HB_TAG`` form.

	The tag is padded with spaces on the right, or truncated, to exactly
	four characters. The default language system is rendered as
	``HB_TAG_NONE`` followed by alignment whitespace.

	Args:
		tag (str): An OpenType tag.

	Returns:
		A snippet of C++ representing ``tag``.
	"""
	if tag == DEFAULT_LANGUAGE_SYSTEM:
		return 'HB_TAG_NONE\t       '
	four_chars = tag.ljust (4)[:4]
	quoted = ','.join ("'%s'" % char for char in four_chars)
	return 'HB_TAG(%s)' % quoted
910
def get_variant_set (name):
	"""Return a set of variant language names from a name.

	The name is split on newlines, parentheses and commas. Each non-empty
	piece has U+2019 replaced with an apostrophe, is decomposed (NFD),
	reduced to its ASCII characters and stripped of surrounding
	whitespace.

	Args:
		name (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		A set of normalized language names, as ``bytes``.
	"""
	variants = set ()
	for piece in re.split ('[\n(),]', name):
		if not piece:
			continue
		decomposed = unicodedata.normalize ('NFD', piece.replace ('\u2019', "'"))
		variants.add (decomposed.encode ('ASCII', 'ignore').strip ())
	return variants
925
def language_name_intersection (a, b):
	"""Return the names in common between two language names.

	Args:
		a (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.
		b (str): A list of language names from the BCP 47 registry,
			joined on ``'\\n'``.

	Returns:
		The normalized language names shared by ``a`` and ``b``.
	"""
	names_of_a = get_variant_set (a)
	names_of_b = get_variant_set (b)
	return names_of_a & names_of_b
939
def get_matching_language_name (intersection, candidates):
	"""Return the first candidate name that shares a normalized name with
	``intersection``.

	Args:
		intersection (AbstractSet[bytes]): A set of normalized names, as
			returned by ``language_name_intersection``.
		candidates (Iterable[str]): The candidate names, in priority
			order.

	Returns:
		The first matching candidate.

	Raises:
		StopIteration: If no candidate matches.
	"""
	matching = (
		candidate for candidate in candidates
		if not intersection.isdisjoint (get_variant_set (candidate))
	)
	return next (matching)
942
def same_tag (bcp_47_tag, ot_tags):
	"""Return whether a three-letter BCP 47 tag maps to exactly one
	OpenType tag that is the same tag in uppercase.

	Args:
		bcp_47_tag (str): A BCP 47 tag.
		ot_tags (Sequence[str]): The OpenType tags it maps to.

	Returns:
		Whether the mapping is the trivial case-changing one.
	"""
	if len (bcp_47_tag) != 3 or len (ot_tags) != 1:
		return False
	return bcp_47_tag == ot_tags[0].lower ()
945
# Emit one ``LangTag`` table per language subtag length. The table of
# three-letter subtags is wrapped in ``#ifndef HB_NO_LANGUAGE_LONG``.
for language_len in (2, 3):
	if language_len == 3:
		print ('#ifndef HB_NO_LANGUAGE_LONG')
	print ('static const LangTag ot_languages%d[] = {' % language_len)
	for language, tags in sorted (ot.from_bcp_47.items ()):
		# Multi-subtag tags and the empty tag do not belong in these tables.
		if language == '' or '-' in language:
			continue
		if len (language) != language_len:
			continue
		# A trivial mapping (e.g. abc -> ABC) is emitted commented out.
		commented_out = same_tag (language, tags)
		for tag in tags:
			print ('%s{%s,\t%s},' % ('/*' if commented_out else '  ', hb_tag (language), hb_tag (tag)), end='')
			if commented_out:
				print ('*/', end='')
			print ('\t/* ', end='')
			bcp_47_name = bcp_47.names.get (language, '')
			bcp_47_name_candidates = bcp_47_name.split ('\n')
			ot_name = ot.names[tag]
			scope = bcp_47.scopes.get (language, '')
			if tag == DEFAULT_LANGUAGE_SYSTEM:
				# The BCP 47 subtag is spelled like an OpenType tag that
				# names something else.
				write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
			else:
				intersection = language_name_intersection (bcp_47_name, ot_name)
				if not intersection:
					write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
				else:
					# The registries agree on a name: remember the matching
					# BCP 47 name and print the longer of the two names.
					name = get_matching_language_name (intersection, bcp_47_name_candidates)
					bcp_47.names[language] = name
					write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
			print (' */')
	print ('};')
	if language_len == 3:
		print ('#endif')
	print ()
979
# Emit the documentation comment and the signature of the generated
# ``hb_ot_tags_from_complex_language`` function. The comment refers to the
# size parameter by its actual name, ``@count``.
print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static inline bool')
print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
print ('\t\t\t\t  const char   *limit,')
print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
print ('{')
1000
def print_subtag_matches (subtag, string, new_line):
	"""Print a ``subtag_matches`` call for a subtag, without a trailing
	newline. Nothing is printed if ``subtag`` is empty or ``None``.

	Args:
		subtag (Optional[str]): The subtag to match.
		string (str): The C expression for the string to search in.
		new_line (bool): Whether to first print a newline and an
			indented ``&&``, to chain this call to a preceding condition.
	"""
	if not subtag:
		return
	lead = '\n\t&& ' if new_line else ''
	# The length argument counts the leading hyphen as well as the subtag.
	call = 'subtag_matches (%s, limit, "-%s", %i)' % (string, subtag, 1 + len (subtag))
	print (lead + call, end='')
1007
# Collect every complex language tag (as decided by ``LanguageTag.is_complex``)
# together with its OpenType tags, bucketed by ``LanguageTag.get_group``.
# Within a bucket the entries keep the iteration order: longest BCP 47 tag
# first, ties broken alphabetically.
complex_tags = collections.defaultdict (list)
for lt_tags in (
		(LanguageTag (language), tags)
		for language, tags in sorted (
			ot.from_bcp_47.items (),
			key=lambda item: (-len (item[0]), item[0]))):
	if lt_tags[0].is_complex ():
		complex_tags[lt_tags[0].get_group ()].append (lt_tags)
1016
# Calculate the min length of the subtags outside the switch, i.e. of the
# entries grouped under 'und'. Each present subtag counts its own length plus
# one for the hyphen in front of it. Entries without OpenType tags are
# ignored; 100 is the starting upper bound.
min_subtag_len = 100
for lt, tags in complex_tags.get ('und', ()):
	if tags:
		subtag_len = sum (
			1 + len (part)
			for part in (lt.script, lt.region, lt.variant)
			if part is not None)
		min_subtag_len = min (subtag_len, min_subtag_len)
1030
# Emit the part of the generated C function that runs before the switch: the
# checks for the entries grouped under 'und'. These are matched by subtag only,
# starting from the first hyphen of the language string (``p``). Strings too
# short to contain the shortest such subtag skip straight to the ``out:``
# label.
print ('  if (limit - lang_str >= %d)' % (min_subtag_len + 2))
print ('  {')
print ("    const char *p = strchr (lang_str, '-');")
print ("    if (!p || p >= limit || limit - p < %i) goto out;" % min_subtag_len)
for initial, items in sorted (complex_tags.items ()):
	if initial != 'und':
		continue
	for lt, tags in items:
		if not tags:
			continue
		if lt.variant in bcp_47.prefixes:
			expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
					'%s is not a valid prefix of %s' % (lt.language, lt.variant))
		print ('    if (', end='')
		# The first condition is printed bare; any further one must be joined
		# with `&&`, otherwise the generated C would contain two calls back
		# to back.
		condition_emitted = False
		for subtag in (lt.script, lt.region, lt.variant):
			print_subtag_matches (subtag, 'p', condition_emitted)
			condition_emitted = condition_emitted or bool (subtag)
		print (')')
		print ('    {')
		write ('      /* %s */' % bcp_47.get_name (lt))
		print ()
		if len (tags) == 1:
			write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
			print ()
			print ('      *count = 1;')
		else:
			# Same shape as the multi-tag branch inside the switch: the loop
			# index has to be declared before the `for` that uses it.
			print ('      unsigned int i;')
			print ('      hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('      };')
			print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('\ttags[i] = possible_tags[i];')
			print ('      *count = i;')
		print ('      return true;')
		print ('    }')
print ('  }')
print ('out:')
1069
# Emit the ``switch`` over the first character of the language string, with
# one ``case`` per group of complex tags other than 'und', and then close the
# generated function with its ``return false`` fallback.
print ('  switch (lang_str[0])')
print ('  {')
for initial, items in sorted (complex_tags.items ()):
	if initial == 'und':
		# Already emitted ahead of the switch.
		continue
	print (f"  case '{initial}':")
	for lt, tags in items:
		if not tags:
			continue
		script, region = lt.script, lt.region
		if lt.grandfathered:
			# Grandfathered tags are compared as a whole.
			condition = '0 == strcmp (&lang_str[1], "%s")' % lt.language[1:]
		else:
			# Fold the script, and the region after it, into the literal that
			# is compared; whatever gets folded in needs no separate
			# subtag check below.
			string_literal = lt.language[1:] + '-'
			if script:
				string_literal += script
				script = None
				if region:
					string_literal += '-' + region
					region = None
			if string_literal.endswith ('-'):
				template = '0 == strncmp (&lang_str[1], "%s", %i)'
			else:
				template = 'lang_matches (&lang_str[1], limit, "%s", %i)'
			condition = template % (string_literal, len (string_literal))
		print ('    if (' + condition, end='')
		for subtag in (script, region, lt.variant):
			print_subtag_matches (subtag, 'lang_str', True)
		print (')')
		print ('    {')
		write ('      /* %s */' % bcp_47.get_name (lt))
		print ()
		if len (tags) != 1:
			print ('      unsigned int i;')
			print ('      hb_tag_t possible_tags[] = {')
			for tag in tags:
				write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
				print ()
			print ('      };')
			print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
			print ('\ttags[i] = possible_tags[i];')
			print ('      *count = i;')
		else:
			write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
			print ()
			print ('      *count = 1;')
		print ('      return true;')
		print ('    }')
	print ('    break;')

print ('  }')
print ('  return false;')
print ('}')
print ()
# Emit the documentation comment, the signature and the opening of the
# ``switch`` of the generated C function ``hb_ot_ambiguous_tag_to_language``.
# The ``case`` labels are emitted by the statements that follow.
print ('\n'.join ([
	'/**',
	' * hb_ot_ambiguous_tag_to_language',
	' * @tag: A language tag.',
	' *',
	' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
	' * many language tags) and the best tag is not the alphabetically first, or if',
	' * the best tag consists of multiple subtags, or if the best tag does not appear',
	' * in #ot_languages.',
	' *',
	' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
	' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
	' **/',
	'static inline hb_language_t',
	'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
	'{',
	'  switch (tag)',
	'  {',
]))
1142
def verify_disambiguation_dict ():
	"""Check ``disambiguation`` for consistency and normalize it in place.

	``disambiguation`` maps ambiguous OpenType language system tags to
	the single BCP 47 tag each should convert back to. For every
	OpenType tag this function determines its primary BCP 47 tags (the
	non-grandfathered ones that list the OpenType tag first) and then:

	* with no primary tag, requires that no disambiguation is given;
	* with one primary tag, requires that no disambiguation is given,
	  and records that tag itself if it has multiple subtags or is not
	  the alphabetically first candidate;
	* with several primary tags, tries to single one out automatically
	  and otherwise requires a valid manual entry; an entry equal to the
	  alphabetically first differing single-subtag tag is dropped again.

	Finally, every key of ``disambiguation`` must be a known OpenType
	tag.

	Raises:
		AssertionError: Verification failed.
	"""
	def not_retired (t):
		return 'retired code' not in bcp_47.scopes.get (t, '')

	# Tried in this order; the first one selecting exactly one primary tag
	# decides. If none does, the result of the last one is what is left.
	tie_breakers = (
		lambda t: t in ot.from_bcp_47_uninherited and not_retired (t),
		lambda t: bcp_47.scopes.get (t) == ' [macrolanguage]',
		lambda t: bcp_47.scopes.get (t) == ' [collection]',
		not_retired,
	)

	for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
		if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
			primary_tags = []
		else:
			primary_tags = [
				t for t in bcp_47_tags
				if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag
			]

		if not primary_tags:
			expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
			continue

		if len (primary_tags) == 1:
			only_tag = primary_tags[0]
			expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
			if '-' in only_tag:
				disambiguation[ot_tag] = only_tag
			else:
				first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0]
				if only_tag != first_tag:
					disambiguation[ot_tag] = only_tag
			continue

		for keep in tie_breakers:
			macrolanguages = [t for t in primary_tags if keep (t)]
			if len (macrolanguages) == 1:
				break
		if len (macrolanguages) != 1:
			# No automatic choice: a manual, valid entry is mandatory.
			expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
			expect (disambiguation[ot_tag] in bcp_47_tags,
					'%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
		elif ot_tag not in disambiguation:
			disambiguation[ot_tag] = macrolanguages[0]
		different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
		chosen = disambiguation[ot_tag]
		if different_bcp_47_tags and chosen == different_bcp_47_tags[0] and '-' not in chosen:
			del disambiguation[ot_tag]

	for ot_tag in disambiguation:
		expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1199
# Validate and normalize ``disambiguation``, emit one ``case`` per remaining
# entry (ordered by OpenType tag), then close the switch, the function and the
# include guard of the generated header.
verify_disambiguation_dict ()
for ot_tag in sorted (disambiguation):
	bcp_47_tag = disambiguation[ot_tag]
	write ('  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
	print ()
	write ('    return hb_language_from_string ("%s", -1);  /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
	print ()

print ('\n'.join ([
	'  default:',
	'    return HB_LANGUAGE_INVALID;',
	'  }',
	'}',
	'',
	'#endif /* HB_OT_TAG_TABLE_HH */',
	'',
	'/* == End of generated table == */',
]))
1216
1217