#!/usr/bin/env python3

"""Generator of the mapping from OpenType tags to BCP 47 tags and vice
versa.

It creates a ``const LangTag[]``, matching the tags from the OpenType
languages system tag list to the language subtags of the BCP 47 language
subtag registry, with some manual adjustments. The mappings are
supplemented with macrolanguages' sublanguages and retired codes'
replacements, according to BCP 47 and some manual additions where BCP 47
omits a retired code entirely.

Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
multiple BCP 47 tags) are listed here, except when the alphabetically
first BCP 47 tag happens to be the chosen disambiguated tag. In that
case, the fallback behavior will choose the right tag anyway.

usage: ./gen-tag-table.py languagetags language-subtag-registry

Input files:
* https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
* https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
"""

import collections
import html
from html.parser import HTMLParser
import itertools
import re
import sys
import unicodedata

# The script needs exactly two file arguments; otherwise show the usage text.
if len (sys.argv) != 3:
    sys.exit (__doc__)

def expect (condition, message=None):
    """Raise ``AssertionError`` unless ``condition`` is truthy.

    Args:
        condition: The value to test.
        message: Optional text for the raised ``AssertionError``.

    Raises:
        AssertionError: If ``condition`` is falsy.
    """
    if not condition:
        if message is None:
            raise AssertionError
        raise AssertionError (message)

def write (s):
    """Write ``s`` to standard output encoded as UTF-8.

    The text layer is flushed first so that output previously produced
    with ``print`` stays ahead of the bytes written here.

    Args:
        s (str): The text to write.
    """
    sys.stdout.flush ()
    sys.stdout.buffer.write (s.encode ('utf-8'))

# The tag used for the default language system (no OpenType tag).
DEFAULT_LANGUAGE_SYSTEM = ''

# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
    'aar': 'aa',
    'abk': 'ab',
    'afr': 'af',
    'aka': 'ak',
    'amh': 'am',
    'ara': 'ar',
    'arg': 'an',
    'asm': 'as',
    'ava': 'av',
    'ave': 'ae',
    'aym': 'ay',
    'aze': 'az',
    'bak': 'ba',
    'bam': 'bm',
    'bel': 'be',
    'ben': 'bn',
    'bis': 'bi',
    'bod': 'bo',
    'bos': 'bs',
    'bre': 'br',
    'bul': 'bg',
    'cat': 'ca',
    'ces': 'cs',
    'cha': 'ch',
    'che': 'ce',
    'chu': 'cu',
    'chv': 'cv',
    'cor': 'kw',
    'cos': 'co',
    'cre': 'cr',
    'cym': 'cy',
    'dan': 'da',
    'deu': 'de',
    'div': 'dv',
    'dzo': 'dz',
    'ell': 'el',
    'eng': 'en',
    'epo': 'eo',
    'est': 'et',
    'eus': 'eu',
    'ewe': 'ee',
    'fao': 'fo',
    'fas': 'fa',
    'fij': 'fj',
    'fin': 'fi',
    'fra': 'fr',
    'fry': 'fy',
    'ful': 'ff',
    'gla': 'gd',
    'gle': 'ga',
    'glg': 'gl',
    'glv': 'gv',
    'grn': 'gn',
    'guj': 'gu',
    'hat': 'ht',
    'hau': 'ha',
    'hbs': 'sh',
    'heb': 'he',
    'her': 'hz',
    'hin': 'hi',
    'hmo': 'ho',
    'hrv': 'hr',
    'hun': 'hu',
    'hye': 'hy',
    'ibo': 'ig',
    'ido': 'io',
    'iii': 'ii',
    'iku': 'iu',
    'ile': 'ie',
    'ina': 'ia',
    'ind': 'id',
    'ipk': 'ik',
    'isl': 'is',
    'ita': 'it',
    'jav': 'jv',
    'jpn': 'ja',
    'kal': 'kl',
    'kan': 'kn',
    'kas': 'ks',
    'kat': 'ka',
    'kau': 'kr',
    'kaz': 'kk',
    'khm': 'km',
    'kik': 'ki',
    'kin': 'rw',
    'kir': 'ky',
    'kom': 'kv',
    'kon': 'kg',
    'kor': 'ko',
    'kua': 'kj',
    'kur': 'ku',
    'lao': 'lo',
    'lat': 'la',
    'lav': 'lv',
    'lim': 'li',
    'lin': 'ln',
    'lit': 'lt',
    'ltz': 'lb',
    'lub': 'lu',
    'lug': 'lg',
    'mah': 'mh',
    'mal': 'ml',
    'mar': 'mr',
    'mkd': 'mk',
    'mlg': 'mg',
    'mlt': 'mt',
    'mol': 'mo',
    'mon': 'mn',
    'mri': 'mi',
    'msa': 'ms',
    'mya': 'my',
    'nau': 'na',
    'nav': 'nv',
    'nbl': 'nr',
    'nde': 'nd',
    'ndo': 'ng',
    'nep': 'ne',
    'nld': 'nl',
    'nno': 'nn',
    'nob': 'nb',
    'nor': 'no',
    'nya': 'ny',
    'oci': 'oc',
    'oji': 'oj',
    'ori': 'or',
    'orm': 'om',
    'oss': 'os',
    'pan': 'pa',
    'pli': 'pi',
    'pol': 'pl',
    'por': 'pt',
    'pus': 'ps',
    'que': 'qu',
    'roh': 'rm',
    'ron': 'ro',
    'run': 'rn',
    'rus': 'ru',
    'sag': 'sg',
    'san': 'sa',
    'sin': 'si',
    'slk': 'sk',
    'slv': 'sl',
    'sme': 'se',
    'smo': 'sm',
    'sna': 'sn',
    'snd': 'sd',
    'som': 'so',
    'sot': 'st',
    'spa': 'es',
    'sqi': 'sq',
    'srd': 'sc',
    'srp': 'sr',
    'ssw': 'ss',
    'sun': 'su',
    'swa': 'sw',
    'swe': 'sv',
    'tah': 'ty',
    'tam': 'ta',
    'tat': 'tt',
    'tel': 'te',
    'tgk': 'tg',
    'tgl': 'tl',
    'tha': 'th',
    'tir': 'ti',
    'ton': 'to',
    'tsn': 'tn',
    'tso': 'ts',
    'tuk': 'tk',
    'tur': 'tr',
    'twi': 'tw',
    'uig': 'ug',
    'ukr': 'uk',
    'urd': 'ur',
    'uzb': 'uz',
    'ven': 've',
    'vie': 'vi',
    'vol': 'vo',
    'wln': 'wa',
    'wol': 'wo',
    'xho': 'xh',
    'yid': 'yi',
    'yor': 'yo',
    'zha': 'za',
    'zho': 'zh',
    'zul': 'zu',
}

class LanguageTag (object):
    """A BCP 47 language tag.

    Attributes:
        subtags (List[str]): The list of subtags in this tag.
        grandfathered (bool): Whether this tag is grandfathered. If
            ``True``, the entire lowercased tag is the ``language``
            and the other subtag fields are empty.
        language (str): The language subtag.
        script (Optional[str]): The script subtag, or ``None`` if a
            non-grandfathered tag has none.
        region (Optional[str]): The region subtag, or ``None`` if a
            non-grandfathered tag has none.
        variant (Optional[str]): The variant subtag, or ``None`` if a
            non-grandfathered tag has none.

    Args:
        tag (str): A BCP 47 language tag.

    """
    def __init__ (self, tag):
        global bcp_47
        self.subtags = tag.lower ().split ('-')
        self.grandfathered = tag.lower () in bcp_47.grandfathered
        if self.grandfathered:
            self.language = tag.lower ()
            self.script = ''
            self.region = ''
            self.variant = ''
        else:
            self.language = self.subtags[0]
            # Subtags are classified by length and by whether the first
            # character sorts after the digits.
            self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
            self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
            self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)

    def __str__(self):
        return '-'.join(self.subtags)

    def __repr__ (self):
        return 'LanguageTag(%r)' % str(self)

    @staticmethod
    def _find_first (function, sequence):
        """Return the first item of ``sequence`` accepted by ``function``.

        Args:
            function: A predicate taking one item.
            sequence: The items to search, in order.

        Returns:
            The first matching item, or ``None`` if there is none.
        """
        return next (filter (function, sequence), None)

    def is_complex (self):
        """Return whether this tag is too complex to represent as a
        ``LangTag`` in the generated code.

        Complex tags need to be handled in
        ``hb_ot_tags_from_complex_language``.

        Returns:
            Whether this tag is complex.
        """
        return not (len (self.subtags) == 1
            or self.grandfathered
            and len (self.subtags[1]) != 3
            and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])

    def get_group (self):
        """Return the group into which this tag should be categorized in
        ``hb_ot_tags_from_complex_language``.

        The group is the first letter of the tag, or ``'und'`` if this tag
        should not be matched in a ``switch`` statement in the generated
        code.

        Returns:
            This tag's group.
        """
        return ('und'
            if (self.language == 'und'
                or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
            else self.language[0])

class OpenTypeRegistryParser (HTMLParser):
    """A parser for the OpenType language system tag registry.

    Attributes:
        header (str): The "last updated" line of the registry.
        names (Mapping[str, str]): A map of language system tags to the
            names they are given in the registry.
        ranks (DefaultDict[str, int]): A map of language system tags to
            numbers. If a single BCP 47 tag corresponds to multiple
            OpenType tags, the tags are ordered in increasing order by
            rank. The rank is based on the number of BCP 47 tags
            associated with a tag, though it may be manually modified.
        to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
            OpenType language system tags to sets of BCP 47 tags.
        from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
            inverted. Its values start as unsorted sets;
            ``sort_languages`` converts them to sorted lists.
        from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
            A copy of ``from_bcp_47``. It starts as ``None`` and is
            populated at the beginning of the first call to
            ``inherit_from_macrolanguages``.

    """
    def __init__ (self):
        HTMLParser.__init__ (self)
        self.header = ''
        self.names = {}
        self.ranks = collections.defaultdict (int)
        self.to_bcp_47 = collections.defaultdict (set)
        self.from_bcp_47 = collections.defaultdict (set)
        self.from_bcp_47_uninherited = None
        # Whether the parser is in a <td> element
        self._td = False
        # Whether the parser is after a <br> element within the current <tr> element
        self._br = False
        # The text of the <td> elements of the current <tr> element.
        self._current_tr = []

    def handle_starttag (self, tag, attrs):
        """Track table structure and capture the ``updated_at`` meta tag."""
        if tag == 'br':
            self._br = True
        elif tag == 'meta':
            for attr, value in attrs:
                if attr == 'name' and value == 'updated_at':
                    self.header = self.get_starttag_text ()
                    break
        elif tag == 'td':
            self._td = True
            self._current_tr.append ('')
        elif tag == 'tr':
            self._br = False
            self._current_tr = []

    def handle_endtag (self, tag):
        """Record the row that just ended, if it had any ``<td>`` cells.

        The cells are read as name, OpenType tag and (optionally) a
        comma-separated list of ISO codes.
        """
        if tag == 'td':
            self._td = False
        elif tag == 'tr' and self._current_tr:
            expect (2 <= len (self._current_tr) <= 3)
            name = self._current_tr[0].strip ()
            tag = self._current_tr[1].strip ("\t\n\v\f\r '")
            rank = 0
            if len (tag) > 4:
                expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
                name += ' (deprecated)'
                tag = tag.split (' ')[0]
                rank = 1
            self.names[tag] = re.sub (' languages$', '', name)
            # A row may legitimately have only two cells (see the check
            # above); then, as with an empty third cell, there are no ISO
            # codes to record.
            if len (self._current_tr) < 3 or not self._current_tr[2]:
                return
            iso_codes = self._current_tr[2].strip ()
            self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
            rank += 2 * len (self.to_bcp_47[tag])
            self.ranks[tag] = rank

    def handle_data (self, data):
        """Append text to the current cell, ignoring text after a ``<br>``."""
        if self._td and not self._br:
            self._current_tr[-1] += data

    def handle_charref (self, name):
        """Treat a numeric character reference as the text it denotes."""
        self.handle_data (html.unescape ('&#%s;' % name))

    def handle_entityref (self, name):
        """Treat a named character reference as the text it denotes."""
        self.handle_data (html.unescape ('&%s;' % name))

    def parse (self, filename):
        """Parse the OpenType language system tag registry.

        Args:
            filename (str): The file name of the registry.
        """
        with open (filename, encoding='utf-8') as f:
            self.feed (f.read ())
        expect (self.header)
        for tag, iso_codes in self.to_bcp_47.items ():
            for iso_code in iso_codes:
                self.from_bcp_47[iso_code].add (tag)

    def add_language (self, bcp_47_tag, ot_tag):
        """Add a language as if it were in the registry.

        Args:
            bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
                a language subtag, and if the language subtag is a
                macrolanguage, then new languages are added corresponding
                to the macrolanguages' individual languages with the
                remainder of the tag appended.
            ot_tag (str): An OpenType language system tag.
        """
        global bcp_47
        self.to_bcp_47[ot_tag].add (bcp_47_tag)
        self.from_bcp_47[bcp_47_tag].add (ot_tag)
        if bcp_47_tag.lower () not in bcp_47.grandfathered:
            try:
                # Unpacking raises ValueError when the tag has no '-',
                # i.e. when it is just a language subtag.
                [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
                if macrolanguage in bcp_47.macrolanguages:
                    s = set ()
                    for language in bcp_47.macrolanguages[macrolanguage]:
                        if language.lower () not in bcp_47.grandfathered:
                            s.add ('%s-%s' % (language, suffix))
                    bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
            except ValueError:
                pass

    @staticmethod
    def _remove_language (tag_1, dict_1, dict_2):
        """Remove ``tag_1`` from ``dict_1`` and from the values of ``dict_2``.

        Keys of ``dict_2`` whose value becomes empty are deleted.
        """
        for tag_2 in dict_1.pop (tag_1):
            dict_2[tag_2].remove (tag_1)
            if not dict_2[tag_2]:
                del dict_2[tag_2]

    def remove_language_ot (self, ot_tag):
        """Remove an OpenType tag from the registry.

        Args:
            ot_tag (str): An OpenType tag.
        """
        self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

    def remove_language_bcp_47 (self, bcp_47_tag):
        """Remove a BCP 47 tag from the registry.

        Args:
            bcp_47_tag (str): A BCP 47 tag.
        """
        self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

    def inherit_from_macrolanguages (self):
        """Copy mappings from macrolanguages to individual languages.

        If a BCP 47 tag for an individual mapping has no OpenType
        mapping but its macrolanguage does, the mapping is copied to
        the individual language. For example, als (Tosk Albanian) has no
        explicit mapping, so it inherits from sq (Albanian) the mapping
        to SQI.

        However, if an OpenType tag maps to a BCP 47 macrolanguage and
        some but not all of its individual languages, the mapping is not
        inherited from the macrolanguage to the missing individual
        languages. For example, INUK (Nunavik Inuktitut) is mapped to
        ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
        ikt (Inuinnaqtun, which is an individual language of iu), so
        this method does not add a mapping from ikt to INUK.

        If a BCP 47 tag for a macrolanguage has no OpenType mapping but
        some of its individual languages do, their mappings are copied
        to the macrolanguage.
        """
        global bcp_47
        first_time = self.from_bcp_47_uninherited is None
        if first_time:
            self.from_bcp_47_uninherited = dict (self.from_bcp_47)
        # Iterate over a copy: add_language may add macrolanguage entries.
        for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
            ot_macrolanguages = {
                ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
            }
            blocked_ot_macrolanguages = set ()
            if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
                for ot_macrolanguage in ot_macrolanguages:
                    round_trip_macrolanguages = {
                        l for l in self.to_bcp_47[ot_macrolanguage]
                        if 'retired code' not in bcp_47.scopes.get (l, '')
                    }
                    round_trip_languages = {
                        l for l in languages
                        if 'retired code' not in bcp_47.scopes.get (l, '')
                    }
                    intersection = round_trip_macrolanguages & round_trip_languages
                    if intersection and intersection != round_trip_languages:
                        blocked_ot_macrolanguages.add (ot_macrolanguage)
            if ot_macrolanguages:
                for ot_macrolanguage in ot_macrolanguages:
                    if ot_macrolanguage not in blocked_ot_macrolanguages:
                        for language in languages:
                            self.add_language (language, ot_macrolanguage)
                            if not blocked_ot_macrolanguages:
                                self.ranks[ot_macrolanguage] += 1
            elif first_time:
                for language in languages:
                    if language in self.from_bcp_47_uninherited:
                        ot_macrolanguages |= self.from_bcp_47_uninherited[language]
                    else:
                        ot_macrolanguages.clear ()
                    if not ot_macrolanguages:
                        break
                for ot_macrolanguage in ot_macrolanguages:
                    self.add_language (macrolanguage, ot_macrolanguage)

    def sort_languages (self):
        """Sort the values of ``from_bcp_47`` in ascending rank order."""
        for language, tags in self.from_bcp_47.items ():
            self.from_bcp_47[language] = sorted (tags,
                    key=lambda t: (self.ranks[t] + rank_delta (language, t), t))

ot = OpenTypeRegistryParser ()

class BCP47Parser (object):
    """A parser for the BCP 47 subtag registry.

    Attributes:
        header (str): The "File-Date" line of the registry.
        names (Mapping[str, str]): A map of subtags to the names they
            are given in the registry. Each value is a
            ``'\\n'``-separated list of names.
        scopes (Mapping[str, str]): A map of language subtags to strings
            suffixed to language names, including suffixes to explain
            language scopes.
        macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
            language subtags to the sets of language subtags which
            inherit from them. See
            ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
        prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
            subtags to their prefixes.
        grandfathered (AbstractSet[str]): The set of grandfathered tags,
            normalized to lowercase.

    """
    def __init__ (self):
        self.header = ''
        self.names = {}
        self.scopes = {}
        self.macrolanguages = collections.defaultdict (set)
        self.prefixes = collections.defaultdict (set)
        self.grandfathered = set ()

    def parse (self, filename):
        """Parse the BCP 47 subtag registry.

        Args:
            filename (str): The file name of the registry.
        """
        with open (filename, encoding='utf-8') as f:
            subtag_type = None
            subtag = None
            deprecated = False
            has_preferred_value = False
            line_buffer = ''
            # The extra empty string flushes the final buffered line.
            for line in itertools.chain (f, ['']):
                line = line.rstrip ()
                # Lines starting with a space are joined onto the
                # buffered line; processing lags one line behind reading.
                if line.startswith (' '):
                    line_buffer += line[1:]
                    continue
                line, line_buffer = line_buffer, line
                if line.startswith ('Type: '):
                    subtag_type = line.split (' ')[1]
                    deprecated = False
                    has_preferred_value = False
                elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
                    subtag = line.split (' ')[1]
                    if subtag_type == 'grandfathered':
                        self.grandfathered.add (subtag.lower ())
                elif line.startswith ('Description: '):
                    description = line.split (' ', 1)[1].replace (' (individual language)', '')
                    # Raw string: the pattern contains regex escapes such
                    # as ``\(`` that are not valid Python string escapes.
                    description = re.sub (r' (\(family\)|\((individual |macro)language\)|languages)$', '',
                            description)
                    if subtag in self.names:
                        self.names[subtag] += '\n' + description
                    else:
                        self.names[subtag] = description
                elif subtag_type == 'language' or subtag_type == 'grandfathered':
                    if line.startswith ('Scope: '):
                        scope = line.split (' ')[1]
                        if scope == 'macrolanguage':
                            scope = ' [macrolanguage]'
                        elif scope == 'collection':
                            scope = ' [collection]'
                        else:
                            continue
                        self.scopes[subtag] = scope
                    elif line.startswith ('Deprecated: '):
                        self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
                        deprecated = True
                    elif deprecated and line.startswith ('Comments: see '):
                        # If a subtag is split into multiple replacement subtags,
                        # it essentially represents a macrolanguage.
                        for language in line.replace (',', '').split (' ')[2:]:
                            self._add_macrolanguage (subtag, language)
                    elif line.startswith ('Preferred-Value: '):
                        # If a subtag is deprecated in favor of a single replacement subtag,
                        # it is either a dialect or synonym of the preferred subtag. Either
                        # way, it is close enough to the truth to consider the replacement
                        # the macrolanguage of the deprecated language.
                        has_preferred_value = True
                        macrolanguage = line.split (' ')[1]
                        self._add_macrolanguage (macrolanguage, subtag)
                    elif not has_preferred_value and line.startswith ('Macrolanguage: '):
                        self._add_macrolanguage (line.split (' ')[1], subtag)
                elif subtag_type == 'variant':
                    if line.startswith ('Deprecated: '):
                        self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
                    elif line.startswith ('Prefix: '):
                        self.prefixes[subtag].add (line.split (' ')[1])
                elif line.startswith ('File-Date: '):
                    self.header = line
        expect (self.header)

    def _add_macrolanguage (self, macrolanguage, language):
        """Record ``language`` as inheriting from ``macrolanguage``.

        If ``language`` has no OpenType mapping, the languages already
        recorded under it are added to ``macrolanguage`` as well. If
        ``macrolanguage`` has no OpenType mapping and is itself a member
        of an existing set, ``language`` is added to that set instead.
        """
        global ot
        if language not in ot.from_bcp_47:
            for l in self.macrolanguages.get (language, set ()):
                self._add_macrolanguage (macrolanguage, l)
        if macrolanguage not in ot.from_bcp_47:
            for ls in list (self.macrolanguages.values ()):
                if macrolanguage in ls:
                    ls.add (language)
                    return
        self.macrolanguages[macrolanguage].add (language)

    def remove_extra_macrolanguages (self):
        """Make every language have at most one macrolanguage."""
        inverted = collections.defaultdict (list)
        for macrolanguage, languages in self.macrolanguages.items ():
            for language in languages:
                inverted[language].append (macrolanguage)
        for language, macrolanguages in inverted.items ():
            if len (macrolanguages) > 1:
                macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
                biggest_macrolanguage = macrolanguages.pop ()
                for macrolanguage in macrolanguages:
                    self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

    def _get_name_piece (self, subtag):
        """Return the first name of a subtag plus its scope suffix.

        Args:
            subtag (str): A BCP 47 subtag.

        Returns:
            The name form of ``subtag``.
        """
        return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')

    def get_name (self, lt):
        """Return the names of the subtags in a language tag.

        Args:
            lt (LanguageTag): A BCP 47 language tag.

        Returns:
            The name form of ``lt``.
        """
        name = self._get_name_piece (lt.language)
        if lt.script:
            name += '; ' + self._get_name_piece (lt.script.title ())
        if lt.region:
            name += '; ' + self._get_name_piece (lt.region.upper ())
        if lt.variant:
            name += '; ' + self._get_name_piece (lt.variant)
        return name

bcp_47 = BCP47Parser ()

ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Manual adjustments to the parsed registries.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.add_language ('hy-arevmda', 'HYE')

ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('mnw-TH', 'MONT')

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

ot.remove_language_ot ('QUZ')
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qul', 'QUH')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

bcp_47.macrolanguages['ro-MD'].add ('mo')

ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
ot.remove_language_ot ('ZHTM')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-MO', 'ZHTM')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
803ot.add_language ('zh-HK', 'ZHH') 804ot.add_language ('zh-MO', 'ZHH') 805ot.add_language ('zh-MO', 'ZHTM') 806ot.add_language ('zh-TW', 'ZHT') 807ot.add_language ('lzh', 'ZHT') 808ot.add_language ('lzh-Hans', 'ZHS') 809ot.add_language ('yue', 'ZHH') 810ot.add_language ('yue-Hans', 'ZHS') 811 812bcp_47.macrolanguages['zom'] = {'yos'} 813 814def rank_delta (bcp_47, ot): 815 """Return a delta to apply to a BCP 47 tag's rank. 816 817 Most OpenType tags have a constant rank, but a few have ranks that 818 depend on the BCP 47 tag. 819 820 Args: 821 bcp_47 (str): A BCP 47 tag. 822 ot (str): An OpenType tag to. 823 824 Returns: 825 A number to add to ``ot``'s rank when sorting ``bcp_47``'s 826 OpenType equivalents. 827 """ 828 if bcp_47 == 'ak' and ot == 'AKA': 829 return -1 830 if bcp_47 == 'tw' and ot == 'TWI': 831 return -1 832 return 0 833 834disambiguation = { 835 'ALT': 'alt', 836 'ARK': 'rki', 837 'ATH': 'ath', 838 'BHI': 'bhb', 839 'BLN': 'bjt', 840 'BTI': 'beb', 841 'CCHN': 'cco', 842 'CMR': 'swb', 843 'CPP': 'crp', 844 'CRR': 'crx', 845 'DUJ': 'dwu', 846 'ECR': 'crj', 847 'HAL': 'cfm', 848 'HND': 'hnd', 849 'HYE': 'hyw', 850 'KIS': 'kqs', 851 'KUI': 'uki', 852 'LRC': 'bqi', 853 'NDB': 'nd', 854 'NIS': 'njz', 855 'PLG': 'pce', 856 'PRO': 'pro', 857 'QIN': 'bgr', 858 'QUH': 'quh', 859 'QVI': 'qvi', 860 'QWH': 'qwh', 861 'SIG': 'stv', 862 'SRB': 'sr', 863 'SXT': 'xnj', 864 'ZHH': 'zh-HK', 865 'ZHS': 'zh-Hans', 866 'ZHT': 'zh-Hant', 867 'ZHTM': 'zh-MO', 868} 869 870ot.inherit_from_macrolanguages () 871bcp_47.remove_extra_macrolanguages () 872ot.inherit_from_macrolanguages () 873ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/' 874ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1 875for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names): 876 possible_bcp_47_tag = tricky_ot_tag.lower () 877 if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]: 878 ot.add_language (possible_bcp_47_tag, 
DEFAULT_LANGUAGE_SYSTEM) 879 bcp_47.macrolanguages[possible_bcp_47_tag] = set () 880ot.sort_languages () 881 882print ('/* == Start of generated table == */') 883print ('/*') 884print (' * The following table is generated by running:') 885print (' *') 886print (' * %s languagetags language-subtag-registry' % sys.argv[0]) 887print (' *') 888print (' * on files with these headers:') 889print (' *') 890print (' * %s' % ot.header.strip ()) 891print (' * %s' % bcp_47.header) 892print (' */') 893print () 894print ('#ifndef HB_OT_TAG_TABLE_HH') 895print ('#define HB_OT_TAG_TABLE_HH') 896print () 897 898def hb_tag (tag): 899 """Convert a tag to ``HB_TAG`` form. 900 901 Args: 902 tag (str): An OpenType tag. 903 904 Returns: 905 A snippet of C++ representing ``tag``. 906 """ 907 if tag == DEFAULT_LANGUAGE_SYSTEM: 908 return 'HB_TAG_NONE\t ' 909 return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4]) 910 911def get_variant_set (name): 912 """Return a set of variant language names from a name. 913 914 Args: 915 name (str): A list of language names from the BCP 47 registry, 916 joined on ``'\\n'``. 917 918 Returns: 919 A set of normalized language names. 920 """ 921 return set (unicodedata.normalize ('NFD', n.replace ('\u2019', "'")) 922 .encode ('ASCII', 'ignore') 923 .strip () 924 for n in re.split ('[\n(),]', name) if n) 925 926def language_name_intersection (a, b): 927 """Return the names in common between two language names. 928 929 Args: 930 a (str): A list of language names from the BCP 47 registry, 931 joined on ``'\\n'``. 932 b (str): A list of language names from the BCP 47 registry, 933 joined on ``'\\n'``. 934 935 Returns: 936 The normalized language names shared by ``a`` and ``b``. 
937 """ 938 return get_variant_set (a).intersection (get_variant_set (b)) 939 940def get_matching_language_name (intersection, candidates): 941 return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))) 942 943def same_tag (bcp_47_tag, ot_tags): 944 return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower () 945 946for language_len in (2, 3): 947 if language_len == 3: 948 print ('#ifndef HB_NO_LANGUAGE_LONG') 949 print ('static const LangTag ot_languages%d[] = {' % language_len) 950 for language, tags in sorted (ot.from_bcp_47.items ()): 951 if language == '' or '-' in language: 952 continue 953 if len(language) != language_len: continue 954 commented_out = same_tag (language, tags) 955 for i, tag in enumerate (tags, start=1): 956 print ('%s{%s,\t%s},' % ('/*' if commented_out else ' ', hb_tag (language), hb_tag (tag)), end='') 957 if commented_out: 958 print ('*/', end='') 959 print ('\t/* ', end='') 960 bcp_47_name = bcp_47.names.get (language, '') 961 bcp_47_name_candidates = bcp_47_name.split ('\n') 962 ot_name = ot.names[tag] 963 scope = bcp_47.scopes.get (language, '') 964 if tag == DEFAULT_LANGUAGE_SYSTEM: 965 write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}') 966 else: 967 intersection = language_name_intersection (bcp_47_name, ot_name) 968 if not intersection: 969 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name)) 970 else: 971 name = get_matching_language_name (intersection, bcp_47_name_candidates) 972 bcp_47.names[language] = name 973 write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)) 974 print (' */') 975 print ('};') 976 if language_len == 3: 977 print ('#endif') 978 print () 979 980print ('/**') 981print (' * hb_ot_tags_from_complex_language:') 982print (' * @lang_str: a BCP 47 language tag to convert.') 983print (' * @limit: a pointer to the end of the substring of @lang_str to consider for') 984print (' * 
conversion.') 985print (' * @count: maximum number of language tags to retrieve (IN) and actual number of') 986print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.') 987print (' * @tags: array of size at least @language_count to store the language tag') 988print (' * results') 989print (' *') 990print (' * Converts a multi-subtag BCP 47 language tag to language tags.') 991print (' *') 992print (' * Return value: Whether any language systems were retrieved.') 993print (' **/') 994print ('static inline bool') 995print ('hb_ot_tags_from_complex_language (const char *lang_str,') 996print ('\t\t\t\t const char *limit,') 997print ('\t\t\t\t unsigned int *count /* IN/OUT */,') 998print ('\t\t\t\t hb_tag_t *tags /* OUT */)') 999print ('{') 1000 1001def print_subtag_matches (subtag, string, new_line): 1002 if subtag: 1003 if new_line: 1004 print () 1005 print ('\t&& ', end='') 1006 print ('subtag_matches (%s, limit, "-%s", %i)' % (string, subtag, 1 + len (subtag)), end='') 1007 1008complex_tags = collections.defaultdict (list) 1009for initial, group in itertools.groupby ((lt_tags for lt_tags in [ 1010 (LanguageTag (language), tags) 1011 for language, tags in sorted (ot.from_bcp_47.items (), 1012 key=lambda i: (-len (i[0]), i[0])) 1013 ] if lt_tags[0].is_complex ()), 1014 key=lambda lt_tags: lt_tags[0].get_group ()): 1015 complex_tags[initial] += group 1016 1017# Calculate the min length of the subtags outside the switch 1018min_subtag_len = 100 1019for initial, items in sorted (complex_tags.items ()): 1020 if initial != 'und': 1021 continue 1022 for lt, tags in items: 1023 if not tags: 1024 continue 1025 subtag_len = 0 1026 subtag_len += 1 + len (lt.script) if lt.script is not None else 0 1027 subtag_len += 1 + len (lt.region) if lt.region is not None else 0 1028 subtag_len += 1 + len (lt.variant) if lt.variant is not None else 0 1029 min_subtag_len = min(subtag_len, min_subtag_len) 1030 1031print (' if (limit - lang_str >= %d)' % 
(min_subtag_len + 2)) 1032print (' {') 1033print (" const char *p = strchr (lang_str, '-');") 1034print (" if (!p || p >= limit || limit - p < %i) goto out;" % min_subtag_len) 1035for initial, items in sorted (complex_tags.items ()): 1036 if initial != 'und': 1037 continue 1038 for lt, tags in items: 1039 if not tags: 1040 continue 1041 if lt.variant in bcp_47.prefixes: 1042 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language, 1043 '%s is not a valid prefix of %s' % (lt.language, lt.variant)) 1044 print (' if (', end='') 1045 print_subtag_matches (lt.script, 'p', False) 1046 print_subtag_matches (lt.region, 'p', False) 1047 print_subtag_matches (lt.variant, 'p', False) 1048 print (')') 1049 print (' {') 1050 write (' /* %s */' % bcp_47.get_name (lt)) 1051 print () 1052 if len (tags) == 1: 1053 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 1054 print () 1055 print (' *count = 1;') 1056 else: 1057 print (' hb_tag_t possible_tags[] = {') 1058 for tag in tags: 1059 write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag])) 1060 print () 1061 print (' };') 1062 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1063 print ('\ttags[i] = possible_tags[i];') 1064 print (' *count = i;') 1065 print (' return true;') 1066 print (' }') 1067print (' }') 1068print ('out:') 1069 1070print (' switch (lang_str[0])') 1071print (' {') 1072for initial, items in sorted (complex_tags.items ()): 1073 if initial == 'und': 1074 continue 1075 print (" case '%s':" % initial) 1076 for lt, tags in items: 1077 if not tags: 1078 continue 1079 print (' if (', end='') 1080 script = lt.script 1081 region = lt.region 1082 if lt.grandfathered: 1083 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='') 1084 else: 1085 string_literal = lt.language[1:] + '-' 1086 if script: 1087 string_literal += script 1088 script = None 1089 if region: 1090 string_literal += '-' + region 1091 region = None 1092 if string_literal[-1] == '-': 1093 print 
('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='') 1094 else: 1095 print ('lang_matches (&lang_str[1], limit, "%s", %i)' % (string_literal, len (string_literal)), end='') 1096 print_subtag_matches (script, 'lang_str', True) 1097 print_subtag_matches (region, 'lang_str', True) 1098 print_subtag_matches (lt.variant, 'lang_str', True) 1099 print (')') 1100 print (' {') 1101 write (' /* %s */' % bcp_47.get_name (lt)) 1102 print () 1103 if len (tags) == 1: 1104 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 1105 print () 1106 print (' *count = 1;') 1107 else: 1108 print (' unsigned int i;') 1109 print (' hb_tag_t possible_tags[] = {') 1110 for tag in tags: 1111 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag])) 1112 print () 1113 print (' };') 1114 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1115 print ('\ttags[i] = possible_tags[i];') 1116 print (' *count = i;') 1117 print (' return true;') 1118 print (' }') 1119 print (' break;') 1120 1121print (' }') 1122print (' return false;') 1123print ('}') 1124print () 1125print ('/**') 1126print (' * hb_ot_ambiguous_tag_to_language') 1127print (' * @tag: A language tag.') 1128print (' *') 1129print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to') 1130print (' * many language tags) and the best tag is not the alphabetically first, or if') 1131print (' * the best tag consists of multiple subtags, or if the best tag does not appear') 1132print (' * in #ot_languages.') 1133print (' *') 1134print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,') 1135print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.') 1136print (' **/') 1137print ('static inline hb_language_t') 1138print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)') 1139print ('{') 1140print (' switch (tag)') 1141print (' {') 1142 1143def verify_disambiguation_dict (): 1144 """Verify and normalize 
``disambiguation``. 1145 1146 ``disambiguation`` is a map of ambiguous OpenType language system 1147 tags to the particular BCP 47 tags they correspond to. This function 1148 checks that all its keys really are ambiguous and that each key's 1149 value is valid for that key. It checks that no ambiguous tag is 1150 missing, except when it can figure out which BCP 47 tag is the best 1151 by itself. 1152 1153 It modifies ``disambiguation`` to remove keys whose values are the 1154 same as those that the fallback would return anyway, and to add 1155 ambiguous keys whose disambiguations it determined automatically. 1156 1157 Raises: 1158 AssertionError: Verification failed. 1159 """ 1160 global bcp_47 1161 global disambiguation 1162 global ot 1163 for ot_tag, bcp_47_tags in ot.to_bcp_47.items (): 1164 if ot_tag == DEFAULT_LANGUAGE_SYSTEM: 1165 primary_tags = [] 1166 else: 1167 primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag) 1168 if len (primary_tags) == 1: 1169 expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag) 1170 if '-' in primary_tags[0]: 1171 disambiguation[ot_tag] = primary_tags[0] 1172 else: 1173 first_tag = sorted (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t))[0] 1174 if primary_tags[0] != first_tag: 1175 disambiguation[ot_tag] = primary_tags[0] 1176 elif len (primary_tags) == 0: 1177 expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) 1178 else: 1179 original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')] 1180 if len (original_languages) == 1: 1181 macrolanguages = original_languages 1182 else: 1183 macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]'] 1184 if len (macrolanguages) != 1: 1185 macrolanguages = list (t for t in primary_tags if 
bcp_47.scopes.get (t) == ' [collection]') 1186 if len (macrolanguages) != 1: 1187 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')) 1188 if len (macrolanguages) != 1: 1189 expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages))) 1190 expect (disambiguation[ot_tag] in bcp_47_tags, 1191 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag)) 1192 elif ot_tag not in disambiguation: 1193 disambiguation[ot_tag] = macrolanguages[0] 1194 different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t))) 1195 if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]: 1196 del disambiguation[ot_tag] 1197 for ot_tag in disambiguation.keys (): 1198 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag) 1199 1200verify_disambiguation_dict () 1201for ot_tag, bcp_47_tag in sorted (disambiguation.items ()): 1202 write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag])) 1203 print () 1204 write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag)))) 1205 print () 1206 1207print (' default:') 1208print (' return HB_LANGUAGE_INVALID;') 1209print (' }') 1210print ('}') 1211 1212print () 1213print ('#endif /* HB_OT_TAG_TABLE_HH */') 1214print () 1215print ('/* == End of generated table == */') 1216 1217