1#!/usr/bin/env python3 2 3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice 4versa. 5 6It creates a ``const LangTag[]``, matching the tags from the OpenType 7languages system tag list to the language subtags of the BCP 47 language 8subtag registry, with some manual adjustments. The mappings are 9supplemented with macrolanguages' sublanguages and retired codes' 10replacements, according to BCP 47 and some manual additions where BCP 47 11omits a retired code entirely. 12 13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``, 14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags 15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to 16multiple BCP 47 tags) are listed here, except when the alphabetically 17first BCP 47 tag happens to be the chosen disambiguated tag. In that 18case, the fallback behavior will choose the right tag anyway. 19 20usage: ./gen-tag-table.py languagetags language-subtag-registry 21 22Input files: 23* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags 24* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry 25""" 26 27import collections 28import html 29from html.parser import HTMLParser 30import itertools 31import re 32import sys 33import unicodedata 34 35if len (sys.argv) != 3: 36 sys.exit (__doc__) 37 38def expect (condition, message=None): 39 if not condition: 40 if message is None: 41 raise AssertionError 42 raise AssertionError (message) 43 44def write (s): 45 sys.stdout.flush () 46 sys.stdout.buffer.write (s.encode ('utf-8')) 47 48DEFAULT_LANGUAGE_SYSTEM = '' 49 50# from https://www-01.sil.org/iso639-3/iso-639-3.tab 51ISO_639_3_TO_1 = { 52 'aar': 'aa', 53 'abk': 'ab', 54 'afr': 'af', 55 'aka': 'ak', 56 'amh': 'am', 57 'ara': 'ar', 58 'arg': 'an', 59 'asm': 'as', 60 'ava': 'av', 61 'ave': 'ae', 62 'aym': 'ay', 63 'aze': 'az', 64 'bak': 'ba', 65 'bam': 'bm', 66 'bel': 'be', 67 'ben': 'bn', 68 'bis': 'bi', 69 'bod': 'bo', 
70 'bos': 'bs', 71 'bre': 'br', 72 'bul': 'bg', 73 'cat': 'ca', 74 'ces': 'cs', 75 'cha': 'ch', 76 'che': 'ce', 77 'chu': 'cu', 78 'chv': 'cv', 79 'cor': 'kw', 80 'cos': 'co', 81 'cre': 'cr', 82 'cym': 'cy', 83 'dan': 'da', 84 'deu': 'de', 85 'div': 'dv', 86 'dzo': 'dz', 87 'ell': 'el', 88 'eng': 'en', 89 'epo': 'eo', 90 'est': 'et', 91 'eus': 'eu', 92 'ewe': 'ee', 93 'fao': 'fo', 94 'fas': 'fa', 95 'fij': 'fj', 96 'fin': 'fi', 97 'fra': 'fr', 98 'fry': 'fy', 99 'ful': 'ff', 100 'gla': 'gd', 101 'gle': 'ga', 102 'glg': 'gl', 103 'glv': 'gv', 104 'grn': 'gn', 105 'guj': 'gu', 106 'hat': 'ht', 107 'hau': 'ha', 108 'hbs': 'sh', 109 'heb': 'he', 110 'her': 'hz', 111 'hin': 'hi', 112 'hmo': 'ho', 113 'hrv': 'hr', 114 'hun': 'hu', 115 'hye': 'hy', 116 'ibo': 'ig', 117 'ido': 'io', 118 'iii': 'ii', 119 'iku': 'iu', 120 'ile': 'ie', 121 'ina': 'ia', 122 'ind': 'id', 123 'ipk': 'ik', 124 'isl': 'is', 125 'ita': 'it', 126 'jav': 'jv', 127 'jpn': 'ja', 128 'kal': 'kl', 129 'kan': 'kn', 130 'kas': 'ks', 131 'kat': 'ka', 132 'kau': 'kr', 133 'kaz': 'kk', 134 'khm': 'km', 135 'kik': 'ki', 136 'kin': 'rw', 137 'kir': 'ky', 138 'kom': 'kv', 139 'kon': 'kg', 140 'kor': 'ko', 141 'kua': 'kj', 142 'kur': 'ku', 143 'lao': 'lo', 144 'lat': 'la', 145 'lav': 'lv', 146 'lim': 'li', 147 'lin': 'ln', 148 'lit': 'lt', 149 'ltz': 'lb', 150 'lub': 'lu', 151 'lug': 'lg', 152 'mah': 'mh', 153 'mal': 'ml', 154 'mar': 'mr', 155 'mkd': 'mk', 156 'mlg': 'mg', 157 'mlt': 'mt', 158 'mol': 'mo', 159 'mon': 'mn', 160 'mri': 'mi', 161 'msa': 'ms', 162 'mya': 'my', 163 'nau': 'na', 164 'nav': 'nv', 165 'nbl': 'nr', 166 'nde': 'nd', 167 'ndo': 'ng', 168 'nep': 'ne', 169 'nld': 'nl', 170 'nno': 'nn', 171 'nob': 'nb', 172 'nor': 'no', 173 'nya': 'ny', 174 'oci': 'oc', 175 'oji': 'oj', 176 'ori': 'or', 177 'orm': 'om', 178 'oss': 'os', 179 'pan': 'pa', 180 'pli': 'pi', 181 'pol': 'pl', 182 'por': 'pt', 183 'pus': 'ps', 184 'que': 'qu', 185 'roh': 'rm', 186 'ron': 'ro', 187 'run': 'rn', 188 'rus': 'ru', 189 
'sag': 'sg', 190 'san': 'sa', 191 'sin': 'si', 192 'slk': 'sk', 193 'slv': 'sl', 194 'sme': 'se', 195 'smo': 'sm', 196 'sna': 'sn', 197 'snd': 'sd', 198 'som': 'so', 199 'sot': 'st', 200 'spa': 'es', 201 'sqi': 'sq', 202 'srd': 'sc', 203 'srp': 'sr', 204 'ssw': 'ss', 205 'sun': 'su', 206 'swa': 'sw', 207 'swe': 'sv', 208 'tah': 'ty', 209 'tam': 'ta', 210 'tat': 'tt', 211 'tel': 'te', 212 'tgk': 'tg', 213 'tgl': 'tl', 214 'tha': 'th', 215 'tir': 'ti', 216 'ton': 'to', 217 'tsn': 'tn', 218 'tso': 'ts', 219 'tuk': 'tk', 220 'tur': 'tr', 221 'twi': 'tw', 222 'uig': 'ug', 223 'ukr': 'uk', 224 'urd': 'ur', 225 'uzb': 'uz', 226 'ven': 've', 227 'vie': 'vi', 228 'vol': 'vo', 229 'wln': 'wa', 230 'wol': 'wo', 231 'xho': 'xh', 232 'yid': 'yi', 233 'yor': 'yo', 234 'zha': 'za', 235 'zho': 'zh', 236 'zul': 'zu', 237} 238 239class LanguageTag (object): 240 """A BCP 47 language tag. 241 242 Attributes: 243 subtags (List[str]): The list of subtags in this tag. 244 grandfathered (bool): Whether this tag is grandfathered. If 245 ``true``, the entire lowercased tag is the ``language`` 246 and the other subtag fields are empty. 247 language (str): The language subtag. 248 script (str): The script subtag. 249 region (str): The region subtag. 250 variant (str): The variant subtag. 251 252 Args: 253 tag (str): A BCP 47 language tag. 
254 255 """ 256 def __init__ (self, tag): 257 global bcp_47 258 self.subtags = tag.lower ().split ('-') 259 self.grandfathered = tag.lower () in bcp_47.grandfathered 260 if self.grandfathered: 261 self.language = tag.lower () 262 self.script = '' 263 self.region = '' 264 self.variant = '' 265 else: 266 self.language = self.subtags[0] 267 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags) 268 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:]) 269 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags) 270 271 def __str__(self): 272 return '-'.join(self.subtags) 273 274 def __repr__ (self): 275 return 'LanguageTag(%r)' % str(self) 276 277 @staticmethod 278 def _find_first (function, sequence): 279 try: 280 return next (iter (filter (function, sequence))) 281 except StopIteration: 282 return None 283 284 def is_complex (self): 285 """Return whether this tag is too complex to represent as a 286 ``LangTag`` in the generated code. 287 288 Complex tags need to be handled in 289 ``hb_ot_tags_from_complex_language``. 290 291 Returns: 292 Whether this tag is complex. 293 """ 294 return not (len (self.subtags) == 1 295 or self.grandfathered 296 and len (self.subtags[1]) != 3 297 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language]) 298 299 def get_group (self): 300 """Return the group into which this tag should be categorized in 301 ``hb_ot_tags_from_complex_language``. 302 303 The group is the first letter of the tag, or ``'und'`` if this tag 304 should not be matched in a ``switch`` statement in the generated 305 code. 306 307 Returns: 308 This tag's group. 
309 """ 310 return ('und' 311 if (self.language == 'und' 312 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1) 313 else self.language[0]) 314 315class OpenTypeRegistryParser (HTMLParser): 316 """A parser for the OpenType language system tag registry. 317 318 Attributes: 319 header (str): The "last updated" line of the registry. 320 names (Mapping[str, str]): A map of language system tags to the 321 names they are given in the registry. 322 ranks (DefaultDict[str, int]): A map of language system tags to 323 numbers. If a single BCP 47 tag corresponds to multiple 324 OpenType tags, the tags are ordered in increasing order by 325 rank. The rank is based on the number of BCP 47 tags 326 associated with a tag, though it may be manually modified. 327 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of 328 OpenType language system tags to sets of BCP 47 tags. 329 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` 330 inverted. Its values start as unsorted sets; 331 ``sort_languages`` converts them to sorted lists. 332 333 """ 334 def __init__ (self): 335 HTMLParser.__init__ (self) 336 self.header = '' 337 self.names = {} 338 self.ranks = collections.defaultdict (int) 339 self.to_bcp_47 = collections.defaultdict (set) 340 self.from_bcp_47 = collections.defaultdict (set) 341 # Whether the parser is in a <td> element 342 self._td = False 343 # The text of the <td> elements of the current <tr> element. 
344 self._current_tr = [] 345 346 def handle_starttag (self, tag, attrs): 347 if tag == 'meta': 348 for attr, value in attrs: 349 if attr == 'name' and value == 'updated_at': 350 self.header = self.get_starttag_text () 351 break 352 elif tag == 'td': 353 self._td = True 354 self._current_tr.append ('') 355 elif tag == 'tr': 356 self._current_tr = [] 357 358 def handle_endtag (self, tag): 359 if tag == 'td': 360 self._td = False 361 elif tag == 'tr' and self._current_tr: 362 expect (2 <= len (self._current_tr) <= 3) 363 name = self._current_tr[0].strip () 364 tag = self._current_tr[1].strip ("\t\n\v\f\r '") 365 rank = 0 366 if len (tag) > 4: 367 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag) 368 name += ' (deprecated)' 369 tag = tag.split (' ')[0] 370 rank = 1 371 self.names[tag] = re.sub (' languages$', '', name) 372 if not self._current_tr[2]: 373 return 374 iso_codes = self._current_tr[2].strip () 375 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (',')) 376 rank += 2 * len (self.to_bcp_47[tag]) 377 self.ranks[tag] = rank 378 379 def handle_data (self, data): 380 if self._td: 381 self._current_tr[-1] += data 382 383 def handle_charref (self, name): 384 self.handle_data (html.unescape ('&#%s;' % name)) 385 386 def handle_entityref (self, name): 387 self.handle_data (html.unescape ('&%s;' % name)) 388 389 def parse (self, filename): 390 """Parse the OpenType language system tag registry. 391 392 Args: 393 filename (str): The file name of the registry. 394 """ 395 with open (filename, encoding='utf-8') as f: 396 self.feed (f.read ()) 397 expect (self.header) 398 for tag, iso_codes in self.to_bcp_47.items (): 399 for iso_code in iso_codes: 400 self.from_bcp_47[iso_code].add (tag) 401 402 def add_language (self, bcp_47_tag, ot_tag): 403 """Add a language as if it were in the registry. 404 405 Args: 406 bcp_47_tag (str): A BCP 47 tag. 
If the tag is more than just 407 a language subtag, and if the language subtag is a 408 macrolanguage, then new languages are added corresponding 409 to the macrolanguages' individual languages with the 410 remainder of the tag appended. 411 ot_tag (str): An OpenType language system tag. 412 """ 413 global bcp_47 414 self.to_bcp_47[ot_tag].add (bcp_47_tag) 415 self.from_bcp_47[bcp_47_tag].add (ot_tag) 416 if bcp_47_tag.lower () not in bcp_47.grandfathered: 417 try: 418 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1) 419 if macrolanguage in bcp_47.macrolanguages: 420 s = set () 421 for language in bcp_47.macrolanguages[macrolanguage]: 422 if language.lower () not in bcp_47.grandfathered: 423 s.add ('%s-%s' % (language, suffix)) 424 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s 425 except ValueError: 426 pass 427 428 @staticmethod 429 def _remove_language (tag_1, dict_1, dict_2): 430 for tag_2 in dict_1.pop (tag_1): 431 dict_2[tag_2].remove (tag_1) 432 if not dict_2[tag_2]: 433 del dict_2[tag_2] 434 435 def remove_language_ot (self, ot_tag): 436 """Remove an OpenType tag from the registry. 437 438 Args: 439 ot_tag (str): An OpenType tag. 440 """ 441 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47) 442 443 def remove_language_bcp_47 (self, bcp_47_tag): 444 """Remove a BCP 47 tag from the registry. 445 446 Args: 447 bcp_47_tag (str): A BCP 47 tag. 448 """ 449 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47) 450 451 def inherit_from_macrolanguages (self): 452 """Copy mappings from macrolanguages to individual languages. 453 454 If a BCP 47 tag for an individual mapping has no OpenType 455 mapping but its macrolanguage does, the mapping is copied to 456 the individual language. For example, als (Tosk Albanian) has no 457 explicit mapping, so it inherits from sq (Albanian) the mapping 458 to SQI. 
459 460 If a BCP 47 tag for a macrolanguage has no OpenType mapping but 461 all of its individual languages do and they all map to the same 462 tags, the mapping is copied to the macrolanguage. 463 """ 464 global bcp_47 465 original_ot_from_bcp_47 = dict (self.from_bcp_47) 466 for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): 467 ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ())) 468 if ot_macrolanguages: 469 for ot_macrolanguage in ot_macrolanguages: 470 for language in languages: 471 self.add_language (language, ot_macrolanguage) 472 self.ranks[ot_macrolanguage] += 1 473 else: 474 for language in languages: 475 if language in original_ot_from_bcp_47: 476 if ot_macrolanguages: 477 ml = original_ot_from_bcp_47[language] 478 if ml: 479 ot_macrolanguages &= ml 480 else: 481 pass 482 else: 483 ot_macrolanguages |= original_ot_from_bcp_47[language] 484 else: 485 ot_macrolanguages.clear () 486 if not ot_macrolanguages: 487 break 488 for ot_macrolanguage in ot_macrolanguages: 489 self.add_language (macrolanguage, ot_macrolanguage) 490 491 def sort_languages (self): 492 """Sort the values of ``from_bcp_47`` in ascending rank order.""" 493 for language, tags in self.from_bcp_47.items (): 494 self.from_bcp_47[language] = sorted (tags, 495 key=lambda t: (self.ranks[t] + rank_delta (language, t), t)) 496 497ot = OpenTypeRegistryParser () 498 499class BCP47Parser (object): 500 """A parser for the BCP 47 subtag registry. 501 502 Attributes: 503 header (str): The "File-Date" line of the registry. 504 names (Mapping[str, str]): A map of subtags to the names they 505 are given in the registry. Each value is a 506 ``'\\n'``-separated list of names. 507 scopes (Mapping[str, str]): A map of language subtags to strings 508 suffixed to language names, including suffixes to explain 509 language scopes. 
510 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of 511 language subtags to the sets of language subtags which 512 inherit from them. See 513 ``OpenTypeRegistryParser.inherit_from_macrolanguages``. 514 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant 515 subtags to their prefixes. 516 grandfathered (AbstractSet[str]): The set of grandfathered tags, 517 normalized to lowercase. 518 519 """ 520 def __init__ (self): 521 self.header = '' 522 self.names = {} 523 self.scopes = {} 524 self.macrolanguages = collections.defaultdict (set) 525 self.prefixes = collections.defaultdict (set) 526 self.grandfathered = set () 527 528 def parse (self, filename): 529 """Parse the BCP 47 subtag registry. 530 531 Args: 532 filename (str): The file name of the registry. 533 """ 534 with open (filename, encoding='utf-8') as f: 535 subtag_type = None 536 subtag = None 537 deprecated = False 538 has_preferred_value = False 539 line_buffer = '' 540 for line in itertools.chain (f, ['']): 541 line = line.rstrip () 542 if line.startswith (' '): 543 line_buffer += line[1:] 544 continue 545 line, line_buffer = line_buffer, line 546 if line.startswith ('Type: '): 547 subtag_type = line.split (' ')[1] 548 deprecated = False 549 has_preferred_value = False 550 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '): 551 subtag = line.split (' ')[1] 552 if subtag_type == 'grandfathered': 553 self.grandfathered.add (subtag.lower ()) 554 elif line.startswith ('Description: '): 555 description = line.split (' ', 1)[1].replace (' (individual language)', '') 556 description = re.sub (' (\(family\)|\((individual |macro)language\)|languages)$', '', 557 description) 558 if subtag in self.names: 559 self.names[subtag] += '\n' + description 560 else: 561 self.names[subtag] = description 562 elif subtag_type == 'language' or subtag_type == 'grandfathered': 563 if line.startswith ('Scope: '): 564 scope = line.split (' ')[1] 565 if scope == 'macrolanguage': 566 scope = ' 
[macrolanguage]' 567 elif scope == 'collection': 568 scope = ' [family]' 569 else: 570 continue 571 self.scopes[subtag] = scope 572 elif line.startswith ('Deprecated: '): 573 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') 574 deprecated = True 575 elif deprecated and line.startswith ('Comments: see '): 576 # If a subtag is split into multiple replacement subtags, 577 # it essentially represents a macrolanguage. 578 for language in line.replace (',', '').split (' ')[2:]: 579 self._add_macrolanguage (subtag, language) 580 elif line.startswith ('Preferred-Value: '): 581 # If a subtag is deprecated in favor of a single replacement subtag, 582 # it is either a dialect or synonym of the preferred subtag. Either 583 # way, it is close enough to the truth to consider the replacement 584 # the macrolanguage of the deprecated language. 585 has_preferred_value = True 586 macrolanguage = line.split (' ')[1] 587 self._add_macrolanguage (macrolanguage, subtag) 588 elif not has_preferred_value and line.startswith ('Macrolanguage: '): 589 self._add_macrolanguage (line.split (' ')[1], subtag) 590 elif subtag_type == 'variant': 591 if line.startswith ('Deprecated: '): 592 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') 593 elif line.startswith ('Prefix: '): 594 self.prefixes[subtag].add (line.split (' ')[1]) 595 elif line.startswith ('File-Date: '): 596 self.header = line 597 expect (self.header) 598 599 def _add_macrolanguage (self, macrolanguage, language): 600 global ot 601 if language not in ot.from_bcp_47: 602 for l in self.macrolanguages.get (language, set ()): 603 self._add_macrolanguage (macrolanguage, l) 604 if macrolanguage not in ot.from_bcp_47: 605 for ls in list (self.macrolanguages.values ()): 606 if macrolanguage in ls: 607 ls.add (language) 608 return 609 self.macrolanguages[macrolanguage].add (language) 610 611 def remove_extra_macrolanguages (self): 612 """Make every language have at most one macrolanguage.""" 613 
inverted = collections.defaultdict (list) 614 for macrolanguage, languages in self.macrolanguages.items (): 615 for language in languages: 616 inverted[language].append (macrolanguage) 617 for language, macrolanguages in inverted.items (): 618 if len (macrolanguages) > 1: 619 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml])) 620 biggest_macrolanguage = macrolanguages.pop () 621 for macrolanguage in macrolanguages: 622 self._add_macrolanguage (biggest_macrolanguage, macrolanguage) 623 624 def _get_name_piece (self, subtag): 625 """Return the first name of a subtag plus its scope suffix. 626 627 Args: 628 subtag (str): A BCP 47 subtag. 629 630 Returns: 631 The name form of ``subtag``. 632 """ 633 return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '') 634 635 def get_name (self, lt): 636 """Return the names of the subtags in a language tag. 637 638 Args: 639 lt (LanguageTag): A BCP 47 language tag. 640 641 Returns: 642 The name form of ``lt``. 643 """ 644 name = self._get_name_piece (lt.language) 645 if lt.script: 646 name += '; ' + self._get_name_piece (lt.script.title ()) 647 if lt.region: 648 name += '; ' + self._get_name_piece (lt.region.upper ()) 649 if lt.variant: 650 name += '; ' + self._get_name_piece (lt.variant) 651 return name 652 653bcp_47 = BCP47Parser () 654 655ot.parse (sys.argv[1]) 656bcp_47.parse (sys.argv[2]) 657 658ot.add_language ('ary', 'MOR') 659 660ot.add_language ('ath', 'ATH') 661 662ot.add_language ('bai', 'BML') 663 664ot.ranks['BAL'] = ot.ranks['KAR'] + 1 665 666ot.add_language ('ber', 'BBR') 667 668ot.remove_language_ot ('PGR') 669ot.add_language ('el-polyton', 'PGR') 670 671bcp_47.macrolanguages['et'] = {'ekk'} 672 673bcp_47.names['flm'] = 'Falam Chin' 674bcp_47.scopes['flm'] = ' (retired code)' 675bcp_47.macrolanguages['flm'] = {'cfm'} 676 677ot.ranks['FNE'] = ot.ranks['TNE'] + 1 678 679ot.add_language ('und-fonipa', 'IPPH') 680 681ot.add_language ('und-fonnapa', 'APPH') 682 683ot.remove_language_ot 
('IRT') 684ot.add_language ('ga-Latg', 'IRT') 685 686ot.add_language ('hy-arevmda', 'HYE') 687 688ot.remove_language_ot ('KGE') 689ot.add_language ('und-Geok', 'KGE') 690 691bcp_47.macrolanguages['id'] = {'in'} 692 693bcp_47.macrolanguages['ijo'] = {'ijc'} 694 695ot.add_language ('kht', 'KHN') 696ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)' 697ot.ranks['KHN'] = ot.ranks['KHT'] + 1 698 699ot.ranks['LCR'] = ot.ranks['MCR'] + 1 700 701ot.names['MAL'] = 'Malayalam Traditional' 702ot.ranks['MLR'] += 1 703 704bcp_47.names['mhv'] = 'Arakanese' 705bcp_47.scopes['mhv'] = ' (retired code)' 706 707ot.add_language ('no', 'NOR') 708 709ot.add_language ('oc-provenc', 'PRO') 710 711ot.add_language ('qu', 'QUZ') 712ot.add_language ('qub', 'QWH') 713ot.add_language ('qud', 'QVI') 714ot.add_language ('qug', 'QVI') 715ot.add_language ('qul', 'QUH') 716ot.add_language ('qup', 'QVI') 717ot.add_language ('qur', 'QWH') 718ot.add_language ('qus', 'QUH') 719ot.add_language ('quw', 'QVI') 720ot.add_language ('qux', 'QWH') 721ot.add_language ('qva', 'QWH') 722ot.add_language ('qvh', 'QWH') 723ot.add_language ('qvj', 'QVI') 724ot.add_language ('qvl', 'QWH') 725ot.add_language ('qvm', 'QWH') 726ot.add_language ('qvn', 'QWH') 727ot.add_language ('qvo', 'QVI') 728ot.add_language ('qvp', 'QWH') 729ot.add_language ('qvw', 'QWH') 730ot.add_language ('qvz', 'QVI') 731ot.add_language ('qwa', 'QWH') 732ot.add_language ('qws', 'QWH') 733ot.add_language ('qxa', 'QWH') 734ot.add_language ('qxc', 'QWH') 735ot.add_language ('qxh', 'QWH') 736ot.add_language ('qxl', 'QVI') 737ot.add_language ('qxn', 'QWH') 738ot.add_language ('qxo', 'QWH') 739ot.add_language ('qxr', 'QVI') 740ot.add_language ('qxt', 'QWH') 741ot.add_language ('qxw', 'QWH') 742 743bcp_47.macrolanguages['ro'].remove ('mo') 744bcp_47.macrolanguages['ro-MD'].add ('mo') 745 746ot.remove_language_ot ('SYRE') 747ot.remove_language_ot ('SYRJ') 748ot.remove_language_ot ('SYRN') 749ot.add_language ('und-Syre', 'SYRE') 750ot.add_language 
('und-Syrj', 'SYRJ') 751ot.add_language ('und-Syrn', 'SYRN') 752 753bcp_47.names['xst'] = "Silt'e" 754bcp_47.scopes['xst'] = ' (retired code)' 755bcp_47.macrolanguages['xst'] = {'stv', 'wle'} 756 757ot.add_language ('xwo', 'TOD') 758 759ot.remove_language_ot ('ZHH') 760ot.remove_language_ot ('ZHP') 761ot.remove_language_ot ('ZHT') 762ot.remove_language_ot ('ZHTM') 763bcp_47.macrolanguages['zh'].remove ('lzh') 764bcp_47.macrolanguages['zh'].remove ('yue') 765ot.add_language ('zh-Hant-MO', 'ZHH') 766ot.add_language ('zh-Hant-MO', 'ZHTM') 767ot.add_language ('zh-Hant-HK', 'ZHH') 768ot.add_language ('zh-Hans', 'ZHS') 769ot.add_language ('zh-Hant', 'ZHT') 770ot.add_language ('zh-HK', 'ZHH') 771ot.add_language ('zh-MO', 'ZHH') 772ot.add_language ('zh-MO', 'ZHTM') 773ot.add_language ('zh-TW', 'ZHT') 774ot.add_language ('lzh', 'ZHT') 775ot.add_language ('lzh-Hans', 'ZHS') 776ot.add_language ('yue', 'ZHH') 777ot.add_language ('yue-Hans', 'ZHS') 778 779bcp_47.macrolanguages['zom'] = {'yos'} 780 781def rank_delta (bcp_47, ot): 782 """Return a delta to apply to a BCP 47 tag's rank. 783 784 Most OpenType tags have a constant rank, but a few have ranks that 785 depend on the BCP 47 tag. 786 787 Args: 788 bcp_47 (str): A BCP 47 tag. 789 ot (str): An OpenType tag to. 790 791 Returns: 792 A number to add to ``ot``'s rank when sorting ``bcp_47``'s 793 OpenType equivalents. 
794 """ 795 if bcp_47 == 'ak' and ot == 'AKA': 796 return -1 797 if bcp_47 == 'tw' and ot == 'TWI': 798 return -1 799 return 0 800 801disambiguation = { 802 'ALT': 'alt', 803 'ARK': 'rki', 804 'ATH': 'ath', 805 'BHI': 'bhb', 806 'BLN': 'bjt', 807 'BTI': 'beb', 808 'CCHN': 'cco', 809 'CMR': 'swb', 810 'CPP': 'crp', 811 'CRR': 'crx', 812 'DUJ': 'dwu', 813 'ECR': 'crj', 814 'HAL': 'cfm', 815 'HND': 'hnd', 816 'HYE': 'hyw', 817 'KIS': 'kqs', 818 'KUI': 'uki', 819 'LRC': 'bqi', 820 'NDB': 'nd', 821 'NIS': 'njz', 822 'PLG': 'pce', 823 'PRO': 'pro', 824 'QIN': 'bgr', 825 'QUH': 'quh', 826 'QVI': 'qvi', 827 'QWH': 'qwh', 828 'SIG': 'stv', 829 'SRB': 'sr', 830 'SXT': 'xnj', 831 'ZHH': 'zh-HK', 832 'ZHS': 'zh-Hans', 833 'ZHT': 'zh-Hant', 834 'ZHTM': 'zh-MO', 835} 836 837ot.inherit_from_macrolanguages () 838bcp_47.remove_extra_macrolanguages () 839ot.inherit_from_macrolanguages () 840ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/' 841ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1 842for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names): 843 possible_bcp_47_tag = tricky_ot_tag.lower () 844 if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]: 845 ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM) 846 bcp_47.macrolanguages[possible_bcp_47_tag] = set () 847ot.sort_languages () 848 849print ('/* == Start of generated table == */') 850print ('/*') 851print (' * The following table is generated by running:') 852print (' *') 853print (' * %s languagetags language-subtag-registry' % sys.argv[0]) 854print (' *') 855print (' * on files with these headers:') 856print (' *') 857print (' * %s' % ot.header.strip ()) 858print (' * %s' % bcp_47.header) 859print (' */') 860print () 861print ('#ifndef HB_OT_TAG_TABLE_HH') 862print ('#define HB_OT_TAG_TABLE_HH') 863print () 864print ('static const LangTag ot_languages[] = {') 865 866def hb_tag (tag): 867 """Convert a tag to ``HB_TAG`` form. 
868 869 Args: 870 tag (str): An OpenType tag. 871 872 Returns: 873 A snippet of C++ representing ``tag``. 874 """ 875 if tag == DEFAULT_LANGUAGE_SYSTEM: 876 return 'HB_TAG_NONE\t ' 877 return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4]) 878 879def get_variant_set (name): 880 """Return a set of variant language names from a name. 881 882 Args: 883 name (str): A list of language names from the BCP 47 registry, 884 joined on ``'\\n'``. 885 886 Returns: 887 A set of normalized language names. 888 """ 889 return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'")) 890 .encode ('ASCII', 'ignore') 891 .strip () 892 for n in re.split ('[\n(),]', name) if n) 893 894def language_name_intersection (a, b): 895 """Return the names in common between two language names. 896 897 Args: 898 a (str): A list of language names from the BCP 47 registry, 899 joined on ``'\\n'``. 900 b (str): A list of language names from the BCP 47 registry, 901 joined on ``'\\n'``. 902 903 Returns: 904 The normalized language names shared by ``a`` and ``b``. 
905 """ 906 return get_variant_set (a).intersection (get_variant_set (b)) 907 908def get_matching_language_name (intersection, candidates): 909 return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))) 910 911def same_tag (bcp_47_tag, ot_tags): 912 return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower () 913 914for language, tags in sorted (ot.from_bcp_47.items ()): 915 if language == '' or '-' in language: 916 continue 917 commented_out = same_tag (language, tags) 918 for i, tag in enumerate (tags, start=1): 919 print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='') 920 if commented_out: 921 print ('*/', end='') 922 print ('\t/* ', end='') 923 bcp_47_name = bcp_47.names.get (language, '') 924 bcp_47_name_candidates = bcp_47_name.split ('\n') 925 ot_name = ot.names[tag] 926 scope = bcp_47.scopes.get (language, '') 927 if tag == DEFAULT_LANGUAGE_SYSTEM: 928 write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}') 929 else: 930 intersection = language_name_intersection (bcp_47_name, ot_name) 931 if not intersection: 932 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name)) 933 else: 934 name = get_matching_language_name (intersection, bcp_47_name_candidates) 935 bcp_47.names[language] = name 936 write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)) 937 print (' */') 938 939print ('};') 940print () 941 942print ('/**') 943print (' * hb_ot_tags_from_complex_language:') 944print (' * @lang_str: a BCP 47 language tag to convert.') 945print (' * @limit: a pointer to the end of the substring of @lang_str to consider for') 946print (' * conversion.') 947print (' * @count: maximum number of language tags to retrieve (IN) and actual number of') 948print (' * language tags retrieved (OUT). 
If no tags are retrieved, it is not modified.') 949print (' * @tags: array of size at least @language_count to store the language tag') 950print (' * results') 951print (' *') 952print (' * Converts a multi-subtag BCP 47 language tag to language tags.') 953print (' *') 954print (' * Return value: Whether any language systems were retrieved.') 955print (' **/') 956print ('static bool') 957print ('hb_ot_tags_from_complex_language (const char *lang_str,') 958print ('\t\t\t\t const char *limit,') 959print ('\t\t\t\t unsigned int *count /* IN/OUT */,') 960print ('\t\t\t\t hb_tag_t *tags /* OUT */)') 961print ('{') 962 963def print_subtag_matches (subtag, new_line): 964 if subtag: 965 if new_line: 966 print () 967 print ('\t&& ', end='') 968 print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='') 969 970complex_tags = collections.defaultdict (list) 971for initial, group in itertools.groupby ((lt_tags for lt_tags in [ 972 (LanguageTag (language), tags) 973 for language, tags in sorted (ot.from_bcp_47.items (), 974 key=lambda i: (-len (i[0]), i[0])) 975 ] if lt_tags[0].is_complex ()), 976 key=lambda lt_tags: lt_tags[0].get_group ()): 977 complex_tags[initial] += group 978 979for initial, items in sorted (complex_tags.items ()): 980 if initial != 'und': 981 continue 982 for lt, tags in items: 983 if lt.variant in bcp_47.prefixes: 984 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language, 985 '%s is not a valid prefix of %s' % (lt.language, lt.variant)) 986 print (' if (', end='') 987 print_subtag_matches (lt.script, False) 988 print_subtag_matches (lt.region, False) 989 print_subtag_matches (lt.variant, False) 990 print (')') 991 print (' {') 992 write (' /* %s */' % bcp_47.get_name (lt)) 993 print () 994 if len (tags) == 1: 995 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 996 print () 997 print (' *count = 1;') 998 else: 999 print (' hb_tag_t possible_tags[] = {') 1000 for tag in tags: 1001 write (' %s, /* %s */' % (hb_tag 
(tag), ot.names[tag])) 1002 print () 1003 print (' };') 1004 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1005 print (' tags[i] = possible_tags[i];') 1006 print (' *count = i;') 1007 print (' return true;') 1008 print (' }') 1009 1010print (' switch (lang_str[0])') 1011print (' {') 1012for initial, items in sorted (complex_tags.items ()): 1013 if initial == 'und': 1014 continue 1015 print (" case '%s':" % initial) 1016 for lt, tags in items: 1017 print (' if (', end='') 1018 script = lt.script 1019 region = lt.region 1020 if lt.grandfathered: 1021 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='') 1022 else: 1023 string_literal = lt.language[1:] + '-' 1024 if script: 1025 string_literal += script 1026 script = None 1027 if region: 1028 string_literal += '-' + region 1029 region = None 1030 if string_literal[-1] == '-': 1031 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='') 1032 else: 1033 print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='') 1034 print_subtag_matches (script, True) 1035 print_subtag_matches (region, True) 1036 print_subtag_matches (lt.variant, True) 1037 print (')') 1038 print (' {') 1039 write (' /* %s */' % bcp_47.get_name (lt)) 1040 print () 1041 if len (tags) == 1: 1042 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 1043 print () 1044 print (' *count = 1;') 1045 else: 1046 print (' unsigned int i;') 1047 print (' hb_tag_t possible_tags[] = {') 1048 for tag in tags: 1049 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag])) 1050 print () 1051 print (' };') 1052 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1053 print ('\ttags[i] = possible_tags[i];') 1054 print (' *count = i;') 1055 print (' return true;') 1056 print (' }') 1057 print (' break;') 1058 1059print (' }') 1060print (' return false;') 1061print ('}') 1062print () 1063print ('/**') 1064print (' * hb_ot_ambiguous_tag_to_language') 1065print (' 
# Remainder of the generated documentation comment, followed by the
# signature and the opening of the ``switch`` of
# ``hb_ot_ambiguous_tag_to_language``.
print ('\n'.join ((
    ' * @tag: A language tag.',
    ' *',
    ' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
    ' * many language tags) and the best tag is not the alphabetically first, or if',
    ' * the best tag consists of multiple subtags, or if the best tag does not appear',
    ' * in #ot_languages.',
    ' *',
    ' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
    ' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
    ' **/',
    'static hb_language_t',
    'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
    '{',
    ' switch (tag)',
    ' {',
)))

def verify_disambiguation_dict ():
    """Check ``disambiguation`` for consistency and normalize it in place.

    ``disambiguation`` maps ambiguous OpenType language system tags to
    the BCP 47 tag each one should be converted back to. For every
    OpenType tag in ``ot.to_bcp_47``, the "primary" BCP 47 tags are the
    non-grandfathered ones whose first entry in ``ot.from_bcp_47`` is
    that OpenType tag (the default language system has none).

    * No primary tag: a manual entry is rejected.
    * One primary tag: a manual entry is rejected; an entry is added
      automatically if the tag contains ``-`` or is not the
      alphabetically first non-grandfathered BCP 47 tag mapping to the
      OpenType tag.
    * Several primary tags: the choice is automatic when exactly one is
      a macrolanguage, else exactly one is a family, else exactly one
      is not a retired code (an existing manual entry is kept).
      Otherwise a manual entry is required and must be one of the
      OpenType tag's BCP 47 tags. The entry is then removed again when
      it has no ``-`` and equals the alphabetically first BCP 47 tag
      for which ``same_tag`` is false, i.e. when it is the same as what
      the fallback would return anyway.

    Finally, every key of ``disambiguation`` must be a known OpenType
    tag.

    Raises:
        AssertionError: Verification failed.
    """
    global bcp_47, disambiguation, ot

    # Scope filters tried in order until one of them leaves exactly one
    # candidate.
    scope_filters = (
        lambda t: bcp_47.scopes.get (t) == ' [macrolanguage]',
        lambda t: bcp_47.scopes.get (t) == ' [family]',
        lambda t: 'retired code' not in bcp_47.scopes.get (t, ''),
    )

    for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
        primary_tags = []
        if ot_tag != DEFAULT_LANGUAGE_SYSTEM:
            primary_tags = [
                t for t in bcp_47_tags
                if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag
            ]

        if not primary_tags:
            expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
            continue

        if len (primary_tags) == 1:
            only_tag = primary_tags[0]
            expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
            # The alphabetically first candidate is only computed when the
            # tag has a single subtag, as the ``or`` short-circuits.
            if '-' in only_tag or only_tag != min (
                    t for t in bcp_47_tags
                    if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t)):
                disambiguation[ot_tag] = only_tag
            continue

        # Several primary tags: try to narrow them down to one by scope.
        macrolanguages = []
        for accepts in scope_filters:
            macrolanguages = [t for t in primary_tags if accepts (t)]
            if len (macrolanguages) == 1:
                break

        if len (macrolanguages) == 1:
            # Keep a manual entry if there is one.
            disambiguation.setdefault (ot_tag, macrolanguages[0])
        else:
            expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
            expect (disambiguation[ot_tag] in bcp_47_tags,
                    '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))

        different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
        if different_bcp_47_tags:
            chosen = disambiguation[ot_tag]
            if chosen == different_bcp_47_tags[0] and '-' not in chosen:
                del disambiguation[ot_tag]

    for ot_tag in disambiguation:
        expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)

# Validate and normalize ``disambiguation`` before emitting it.
verify_disambiguation_dict ()

# One ``case`` per remaining entry, in sorted order of the OpenType tag
# (keys are unique, so this is the same order as sorting the items).
for ot_tag in sorted (disambiguation):
    bcp_47_tag = disambiguation[ot_tag]
    case_line = ' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag])
    write (case_line)
    print ()
    language_name = bcp_47.get_name (LanguageTag (bcp_47_tag))
    write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, language_name))
    print ()

# Default case, end of the function, and end of the generated header.
print ('\n'.join ((
    ' default:',
    ' return HB_LANGUAGE_INVALID;',
    ' }',
    '}',
    '',
    '#endif /* HB_OT_TAG_TABLE_HH */',
    '',
    '/* == End of generated table == */',
)))