1#!/usr/bin/python 2 3"""Generator of the mapping from OpenType tags to BCP 47 tags and vice 4versa. 5 6It creates a ``const LangTag[]``, matching the tags from the OpenType 7languages system tag list to the language subtags of the BCP 47 language 8subtag registry, with some manual adjustments. The mappings are 9supplemented with macrolanguages' sublanguages and retired codes' 10replacements, according to BCP 47 and some manual additions where BCP 47 11omits a retired code entirely. 12 13Also generated is a function, ``hb_ot_ambiguous_tag_to_language``, 14intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags 15back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to 16multiple BCP 47 tags) are listed here, except when the alphabetically 17first BCP 47 tag happens to be the chosen disambiguated tag. In that 18case, the fallback behavior will choose the right tag anyway. 19""" 20 21from __future__ import absolute_import, division, print_function, unicode_literals 22 23import collections 24try: 25 from HTMLParser import HTMLParser 26 def write (s): 27 print (s.encode ('utf-8'), end='') 28except ImportError: 29 from html.parser import HTMLParser 30 def write (s): 31 sys.stdout.flush () 32 sys.stdout.buffer.write (s.encode ('utf-8')) 33import io 34import itertools 35import re 36import sys 37import unicodedata 38 39if len (sys.argv) != 3: 40 print ('usage: ./gen-tag-table.py languagetags language-subtag-registry', file=sys.stderr) 41 sys.exit (1) 42 43try: 44 from html import unescape 45 def html_unescape (parser, entity): 46 return unescape (entity) 47except ImportError: 48 def html_unescape (parser, entity): 49 return parser.unescape (entity) 50 51def expect (condition, message=None): 52 if not condition: 53 if message is None: 54 raise AssertionError 55 raise AssertionError (message) 56 57# from http://www-01.sil.org/iso639-3/iso-639-3.tab 58ISO_639_3_TO_1 = { 59 'aar': 'aa', 60 'abk': 'ab', 61 'afr': 'af', 62 'aka': 'ak', 63 'amh': 'am', 
64 'ara': 'ar', 65 'arg': 'an', 66 'asm': 'as', 67 'ava': 'av', 68 'ave': 'ae', 69 'aym': 'ay', 70 'aze': 'az', 71 'bak': 'ba', 72 'bam': 'bm', 73 'bel': 'be', 74 'ben': 'bn', 75 'bis': 'bi', 76 'bod': 'bo', 77 'bos': 'bs', 78 'bre': 'br', 79 'bul': 'bg', 80 'cat': 'ca', 81 'ces': 'cs', 82 'cha': 'ch', 83 'che': 'ce', 84 'chu': 'cu', 85 'chv': 'cv', 86 'cor': 'kw', 87 'cos': 'co', 88 'cre': 'cr', 89 'cym': 'cy', 90 'dan': 'da', 91 'deu': 'de', 92 'div': 'dv', 93 'dzo': 'dz', 94 'ell': 'el', 95 'eng': 'en', 96 'epo': 'eo', 97 'est': 'et', 98 'eus': 'eu', 99 'ewe': 'ee', 100 'fao': 'fo', 101 'fas': 'fa', 102 'fij': 'fj', 103 'fin': 'fi', 104 'fra': 'fr', 105 'fry': 'fy', 106 'ful': 'ff', 107 'gla': 'gd', 108 'gle': 'ga', 109 'glg': 'gl', 110 'glv': 'gv', 111 'grn': 'gn', 112 'guj': 'gu', 113 'hat': 'ht', 114 'hau': 'ha', 115 'hbs': 'sh', 116 'heb': 'he', 117 'her': 'hz', 118 'hin': 'hi', 119 'hmo': 'ho', 120 'hrv': 'hr', 121 'hun': 'hu', 122 'hye': 'hy', 123 'ibo': 'ig', 124 'ido': 'io', 125 'iii': 'ii', 126 'iku': 'iu', 127 'ile': 'ie', 128 'ina': 'ia', 129 'ind': 'id', 130 'ipk': 'ik', 131 'isl': 'is', 132 'ita': 'it', 133 'jav': 'jv', 134 'jpn': 'ja', 135 'kal': 'kl', 136 'kan': 'kn', 137 'kas': 'ks', 138 'kat': 'ka', 139 'kau': 'kr', 140 'kaz': 'kk', 141 'khm': 'km', 142 'kik': 'ki', 143 'kin': 'rw', 144 'kir': 'ky', 145 'kom': 'kv', 146 'kon': 'kg', 147 'kor': 'ko', 148 'kua': 'kj', 149 'kur': 'ku', 150 'lao': 'lo', 151 'lat': 'la', 152 'lav': 'lv', 153 'lim': 'li', 154 'lin': 'ln', 155 'lit': 'lt', 156 'ltz': 'lb', 157 'lub': 'lu', 158 'lug': 'lg', 159 'mah': 'mh', 160 'mal': 'ml', 161 'mar': 'mr', 162 'mkd': 'mk', 163 'mlg': 'mg', 164 'mlt': 'mt', 165 'mol': 'mo', 166 'mon': 'mn', 167 'mri': 'mi', 168 'msa': 'ms', 169 'mya': 'my', 170 'nau': 'na', 171 'nav': 'nv', 172 'nbl': 'nr', 173 'nde': 'nd', 174 'ndo': 'ng', 175 'nep': 'ne', 176 'nld': 'nl', 177 'nno': 'nn', 178 'nob': 'nb', 179 'nor': 'no', 180 'nya': 'ny', 181 'oci': 'oc', 182 'oji': 'oj', 183 'ori': 
'or', 184 'orm': 'om', 185 'oss': 'os', 186 'pan': 'pa', 187 'pli': 'pi', 188 'pol': 'pl', 189 'por': 'pt', 190 'pus': 'ps', 191 'que': 'qu', 192 'roh': 'rm', 193 'ron': 'ro', 194 'run': 'rn', 195 'rus': 'ru', 196 'sag': 'sg', 197 'san': 'sa', 198 'sin': 'si', 199 'slk': 'sk', 200 'slv': 'sl', 201 'sme': 'se', 202 'smo': 'sm', 203 'sna': 'sn', 204 'snd': 'sd', 205 'som': 'so', 206 'sot': 'st', 207 'spa': 'es', 208 'sqi': 'sq', 209 'srd': 'sc', 210 'srp': 'sr', 211 'ssw': 'ss', 212 'sun': 'su', 213 'swa': 'sw', 214 'swe': 'sv', 215 'tah': 'ty', 216 'tam': 'ta', 217 'tat': 'tt', 218 'tel': 'te', 219 'tgk': 'tg', 220 'tgl': 'tl', 221 'tha': 'th', 222 'tir': 'ti', 223 'ton': 'to', 224 'tsn': 'tn', 225 'tso': 'ts', 226 'tuk': 'tk', 227 'tur': 'tr', 228 'twi': 'tw', 229 'uig': 'ug', 230 'ukr': 'uk', 231 'urd': 'ur', 232 'uzb': 'uz', 233 'ven': 've', 234 'vie': 'vi', 235 'vol': 'vo', 236 'wln': 'wa', 237 'wol': 'wo', 238 'xho': 'xh', 239 'yid': 'yi', 240 'yor': 'yo', 241 'zha': 'za', 242 'zho': 'zh', 243 'zul': 'zu', 244} 245 246class LanguageTag (object): 247 """A BCP 47 language tag. 248 249 Attributes: 250 subtags (List[str]): The list of subtags in this tag. 251 grandfathered (bool): Whether this tag is grandfathered. If 252 ``true``, the entire lowercased tag is the ``language`` 253 and the other subtag fields are empty. 254 language (str): The language subtag. 255 script (str): The script subtag. 256 region (str): The region subtag. 257 variant (str): The variant subtag. 258 259 Args: 260 tag (str): A BCP 47 language tag. 
261 262 """ 263 def __init__ (self, tag): 264 global bcp_47 265 self.subtags = tag.lower ().split ('-') 266 self.grandfathered = tag.lower () in bcp_47.grandfathered 267 if self.grandfathered: 268 self.language = tag.lower () 269 self.script = '' 270 self.region = '' 271 self.variant = '' 272 else: 273 self.language = self.subtags[0] 274 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags) 275 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:]) 276 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags) 277 278 def __str__(self): 279 return '-'.join(self.subtags) 280 281 def __repr__ (self): 282 return 'LanguageTag(%r)' % str(self) 283 284 @staticmethod 285 def _find_first (function, sequence): 286 try: 287 return next (iter (filter (function, sequence))) 288 except StopIteration: 289 return None 290 291 def is_complex (self): 292 """Return whether this tag is too complex to represent as a 293 ``LangTag`` in the generated code. 294 295 Complex tags need to be handled in 296 ``hb_ot_tags_from_complex_language``. 297 298 Returns: 299 Whether this tag is complex. 300 """ 301 return not (len (self.subtags) == 1 302 or self.grandfathered 303 and len (self.subtags[1]) != 3 304 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language]) 305 306 def get_group (self): 307 """Return the group into which this tag should be categorized in 308 ``hb_ot_tags_from_complex_language``. 309 310 The group is the first letter of the tag, or ``'und'`` if this tag 311 should not be matched in a ``switch`` statement in the generated 312 code. 313 314 Returns: 315 This tag's group. 
316 """ 317 return ('und' 318 if (self.language == 'und' 319 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1) 320 else self.language[0]) 321 322class OpenTypeRegistryParser (HTMLParser): 323 """A parser for the OpenType language system tag registry. 324 325 Attributes: 326 header (str): The "last updated" line of the registry. 327 names (Mapping[str, str]): A map of language system tags to the 328 names they are given in the registry. 329 ranks (DefaultDict[str, int]): A map of language system tags to 330 numbers. If a single BCP 47 tag corresponds to multiple 331 OpenType tags, the tags are ordered in increasing order by 332 rank. The rank is based on the number of BCP 47 tags 333 associated with a tag, though it may be manually modified. 334 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of 335 OpenType language system tags to sets of BCP 47 tags. 336 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` 337 inverted. Its values start as unsorted sets; 338 ``sort_languages`` converts them to sorted lists. 339 340 """ 341 def __init__ (self): 342 HTMLParser.__init__ (self) 343 self.header = '' 344 self.names = {} 345 self.ranks = collections.defaultdict (int) 346 self.to_bcp_47 = collections.defaultdict (set) 347 self.from_bcp_47 = collections.defaultdict (set) 348 # Whether the parser is in a <td> element 349 self._td = False 350 # The text of the <td> elements of the current <tr> element. 
351 self._current_tr = [] 352 353 def handle_starttag (self, tag, attrs): 354 if tag == 'meta': 355 for attr, value in attrs: 356 if attr == 'name' and value == 'updated_at': 357 self.header = self.get_starttag_text () 358 break 359 elif tag == 'td': 360 self._td = True 361 self._current_tr.append ('') 362 elif tag == 'tr': 363 self._current_tr = [] 364 365 def handle_endtag (self, tag): 366 if tag == 'td': 367 self._td = False 368 elif tag == 'tr' and self._current_tr: 369 expect (2 <= len (self._current_tr) <= 3) 370 name = self._current_tr[0].strip () 371 tag = self._current_tr[1].strip ("\t\n\v\f\r '") 372 rank = 0 373 if len (tag) > 4: 374 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag) 375 name += ' (deprecated)' 376 tag = tag.split (' ')[0] 377 rank = 1 378 self.names[tag] = re.sub (' languages$', '', name) 379 if not self._current_tr[2]: 380 return 381 iso_codes = self._current_tr[2].strip () 382 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (',')) 383 rank += 2 * len (self.to_bcp_47[tag]) 384 self.ranks[tag] = rank 385 386 def handle_data (self, data): 387 if self._td: 388 self._current_tr[-1] += data 389 390 def handle_charref (self, name): 391 self.handle_data (html_unescape (self, '&#%s;' % name)) 392 393 def handle_entityref (self, name): 394 self.handle_data (html_unescape (self, '&%s;' % name)) 395 396 def parse (self, filename): 397 """Parse the OpenType language system tag registry. 398 399 Args: 400 filename (str): The file name of the registry. 401 """ 402 with io.open (filename, encoding='utf-8') as f: 403 self.feed (f.read ()) 404 expect (self.header) 405 for tag, iso_codes in self.to_bcp_47.items (): 406 for iso_code in iso_codes: 407 self.from_bcp_47[iso_code].add (tag) 408 409 def add_language (self, bcp_47_tag, ot_tag): 410 """Add a language as if it were in the registry. 411 412 Args: 413 bcp_47_tag (str): A BCP 47 tag. 
If the tag is more than just 414 a language subtag, and if the language subtag is a 415 macrolanguage, then new languages are added corresponding 416 to the macrolanguages' individual languages with the 417 remainder of the tag appended. 418 ot_tag (str): An OpenType language system tag. 419 """ 420 global bcp_47 421 self.to_bcp_47[ot_tag].add (bcp_47_tag) 422 self.from_bcp_47[bcp_47_tag].add (ot_tag) 423 if bcp_47_tag.lower () not in bcp_47.grandfathered: 424 try: 425 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1) 426 if macrolanguage in bcp_47.macrolanguages: 427 s = set () 428 for language in bcp_47.macrolanguages[macrolanguage]: 429 if language.lower () not in bcp_47.grandfathered: 430 s.add ('%s-%s' % (language, suffix)) 431 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s 432 except ValueError: 433 pass 434 435 @staticmethod 436 def _remove_language (tag_1, dict_1, dict_2): 437 for tag_2 in dict_1.pop (tag_1): 438 dict_2[tag_2].remove (tag_1) 439 if not dict_2[tag_2]: 440 del dict_2[tag_2] 441 442 def remove_language_ot (self, ot_tag): 443 """Remove an OpenType tag from the registry. 444 445 Args: 446 ot_tag (str): An OpenType tag. 447 """ 448 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47) 449 450 def remove_language_bcp_47 (self, bcp_47_tag): 451 """Remove a BCP 47 tag from the registry. 452 453 Args: 454 bcp_47_tag (str): A BCP 47 tag. 455 """ 456 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47) 457 458 def inherit_from_macrolanguages (self): 459 """Copy mappings from macrolanguages to individual languages. 460 461 If a BCP 47 tag for an individual mapping has no OpenType 462 mapping but its macrolanguage does, the mapping is copied to 463 the individual language. For example, als (Tosk Albanian) has no 464 explicit mapping, so it inherits from sq (Albanian) the mapping 465 to SQI. 
466 467 If a BCP 47 tag for a macrolanguage has no OpenType mapping but 468 all of its individual languages do and they all map to the same 469 tags, the mapping is copied to the macrolanguage. 470 """ 471 global bcp_47 472 original_ot_from_bcp_47 = dict (self.from_bcp_47) 473 for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): 474 ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ())) 475 if ot_macrolanguages: 476 for ot_macrolanguage in ot_macrolanguages: 477 for language in languages: 478 # Remove the following condition if e.g. nn should map to NYN,NOR 479 # instead of just NYN. 480 if language not in original_ot_from_bcp_47: 481 self.add_language (language, ot_macrolanguage) 482 self.ranks[ot_macrolanguage] += 1 483 else: 484 for language in languages: 485 if language in original_ot_from_bcp_47: 486 if ot_macrolanguages: 487 ml = original_ot_from_bcp_47[language] 488 if ml: 489 ot_macrolanguages &= ml 490 else: 491 pass 492 else: 493 ot_macrolanguages |= original_ot_from_bcp_47[language] 494 else: 495 ot_macrolanguages.clear () 496 if not ot_macrolanguages: 497 break 498 for ot_macrolanguage in ot_macrolanguages: 499 self.add_language (macrolanguage, ot_macrolanguage) 500 501 def sort_languages (self): 502 """Sort the values of ``from_bcp_47`` in ascending rank order.""" 503 for language, tags in self.from_bcp_47.items (): 504 self.from_bcp_47[language] = sorted (tags, 505 key=lambda t: (self.ranks[t] + rank_delta (language, t), t)) 506 507ot = OpenTypeRegistryParser () 508 509class BCP47Parser (object): 510 """A parser for the BCP 47 subtag registry. 511 512 Attributes: 513 header (str): The "File-Date" line of the registry. 514 names (Mapping[str, str]): A map of subtags to the names they 515 are given in the registry. Each value is a 516 ``'\\n'``-separated list of names. 
517 scopes (Mapping[str, str]): A map of language subtags to strings 518 suffixed to language names, including suffixes to explain 519 language scopes. 520 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of 521 language subtags to the sets of language subtags which 522 inherit from them. See 523 ``OpenTypeRegistryParser.inherit_from_macrolanguages``. 524 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant 525 subtags to their prefixes. 526 grandfathered (AbstractSet[str]): The set of grandfathered tags, 527 normalized to lowercase. 528 529 """ 530 def __init__ (self): 531 self.header = '' 532 self.names = {} 533 self.scopes = {} 534 self.macrolanguages = collections.defaultdict (set) 535 self.prefixes = collections.defaultdict (set) 536 self.grandfathered = set () 537 538 def parse (self, filename): 539 """Parse the BCP 47 subtag registry. 540 541 Args: 542 filename (str): The file name of the registry. 543 """ 544 with io.open (filename, encoding='utf-8') as f: 545 subtag_type = None 546 subtag = None 547 deprecated = False 548 has_preferred_value = False 549 line_buffer = '' 550 for line in itertools.chain (f, ['']): 551 line = line.rstrip () 552 if line.startswith (' '): 553 line_buffer += line[1:] 554 continue 555 line, line_buffer = line_buffer, line 556 if line.startswith ('Type: '): 557 subtag_type = line.split (' ')[1] 558 deprecated = False 559 has_preferred_value = False 560 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '): 561 subtag = line.split (' ')[1] 562 if subtag_type == 'grandfathered': 563 self.grandfathered.add (subtag.lower ()) 564 elif line.startswith ('Description: '): 565 description = line.split (' ', 1)[1].replace (' (individual language)', '') 566 description = re.sub (' (\((individual |macro)language\)|languages)$', '', 567 description) 568 if subtag in self.names: 569 self.names[subtag] += '\n' + description 570 else: 571 self.names[subtag] = description 572 elif subtag_type == 'language' or 
subtag_type == 'grandfathered': 573 if line.startswith ('Scope: '): 574 scope = line.split (' ')[1] 575 if scope == 'macrolanguage': 576 scope = ' [macrolanguage]' 577 elif scope == 'collection': 578 scope = ' [family]' 579 else: 580 continue 581 self.scopes[subtag] = scope 582 elif line.startswith ('Deprecated: '): 583 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') 584 deprecated = True 585 elif deprecated and line.startswith ('Comments: see '): 586 # If a subtag is split into multiple replacement subtags, 587 # it essentially represents a macrolanguage. 588 for language in line.replace (',', '').split (' ')[2:]: 589 self._add_macrolanguage (subtag, language) 590 elif line.startswith ('Preferred-Value: '): 591 # If a subtag is deprecated in favor of a single replacement subtag, 592 # it is either a dialect or synonym of the preferred subtag. Either 593 # way, it is close enough to the truth to consider the replacement 594 # the macrolanguage of the deprecated language. 
595 has_preferred_value = True 596 macrolanguage = line.split (' ')[1] 597 self._add_macrolanguage (macrolanguage, subtag) 598 elif not has_preferred_value and line.startswith ('Macrolanguage: '): 599 self._add_macrolanguage (line.split (' ')[1], subtag) 600 elif subtag_type == 'variant': 601 if line.startswith ('Prefix: '): 602 self.prefixes[subtag].add (line.split (' ')[1]) 603 elif line.startswith ('File-Date: '): 604 self.header = line 605 expect (self.header) 606 607 def _add_macrolanguage (self, macrolanguage, language): 608 global ot 609 if language not in ot.from_bcp_47: 610 for l in self.macrolanguages.get (language, set ()): 611 self._add_macrolanguage (macrolanguage, l) 612 if macrolanguage not in ot.from_bcp_47: 613 for ls in list (self.macrolanguages.values ()): 614 if macrolanguage in ls: 615 ls.add (language) 616 return 617 self.macrolanguages[macrolanguage].add (language) 618 619 def remove_extra_macrolanguages (self): 620 """Make every language have at most one macrolanguage.""" 621 inverted = collections.defaultdict (list) 622 for macrolanguage, languages in self.macrolanguages.items (): 623 for language in languages: 624 inverted[language].append (macrolanguage) 625 for language, macrolanguages in inverted.items (): 626 if len (macrolanguages) > 1: 627 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml])) 628 biggest_macrolanguage = macrolanguages.pop () 629 for macrolanguage in macrolanguages: 630 self._add_macrolanguage (biggest_macrolanguage, macrolanguage) 631 632 def get_name (self, lt): 633 """Return the names of the subtags in a language tag. 634 635 Args: 636 lt (LanguageTag): A BCP 47 language tag. 637 638 Returns: 639 The name form of ``lt``. 
640 """ 641 name = self.names[lt.language].split ('\n')[0] 642 if lt.script: 643 name += '; ' + self.names[lt.script.title ()].split ('\n')[0] 644 if lt.region: 645 name += '; ' + self.names[lt.region.upper ()].split ('\n')[0] 646 if lt.variant: 647 name += '; ' + self.names[lt.variant].split ('\n')[0] 648 return name 649 650bcp_47 = BCP47Parser () 651 652ot.parse (sys.argv[1]) 653bcp_47.parse (sys.argv[2]) 654 655ot.add_language ('ary', 'MOR') 656 657ot.add_language ('ath', 'ATH') 658 659ot.add_language ('bai', 'BML') 660 661ot.ranks['BAL'] = ot.ranks['KAR'] + 1 662 663ot.add_language ('ber', 'BBR') 664 665ot.remove_language_ot ('PGR') 666ot.add_language ('el-polyton', 'PGR') 667 668bcp_47.macrolanguages['et'] = {'ekk'} 669 670bcp_47.names['flm'] = 'Falam Chin' 671bcp_47.scopes['flm'] = ' (retired code)' 672bcp_47.macrolanguages['flm'] = {'cfm'} 673 674ot.ranks['FNE'] = ot.ranks['TNE'] + 1 675 676ot.add_language ('und-fonipa', 'IPPH') 677 678ot.add_language ('und-fonnapa', 'APPH') 679 680ot.remove_language_ot ('IRT') 681ot.add_language ('ga-Latg', 'IRT') 682 683ot.remove_language_ot ('KGE') 684ot.add_language ('und-Geok', 'KGE') 685 686ot.add_language ('guk', 'GUK') 687ot.names['GUK'] = 'Gumuz (SIL fonts)' 688ot.ranks['GUK'] = ot.ranks['GMZ'] + 1 689 690bcp_47.macrolanguages['id'] = {'in'} 691 692bcp_47.macrolanguages['ijo'] = {'ijc'} 693 694ot.add_language ('kht', 'KHN') 695ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)' 696ot.names['KHT'] = ot.names['KHT'] + ' (OpenType spec and SIL fonts)' 697ot.ranks['KHN'] = ot.ranks['KHT'] 698ot.ranks['KHT'] += 1 699 700ot.ranks['LCR'] = ot.ranks['MCR'] + 1 701 702ot.names['MAL'] = 'Malayalam Traditional' 703ot.ranks['MLR'] += 1 704 705bcp_47.names['mhv'] = 'Arakanese' 706bcp_47.scopes['mhv'] = ' (retired code)' 707 708ot.add_language ('no', 'NOR') 709 710ot.add_language ('oc-provenc', 'PRO') 711 712ot.add_language ('qu', 'QUZ') 713ot.add_language ('qub', 'QWH') 714ot.add_language ('qud', 'QVI') 715ot.add_language 
('qug', 'QVI') 716ot.add_language ('qup', 'QVI') 717ot.add_language ('qur', 'QWH') 718ot.add_language ('qus', 'QUH') 719ot.add_language ('quw', 'QVI') 720ot.add_language ('qux', 'QWH') 721ot.add_language ('qva', 'QWH') 722ot.add_language ('qvh', 'QWH') 723ot.add_language ('qvj', 'QVI') 724ot.add_language ('qvl', 'QWH') 725ot.add_language ('qvm', 'QWH') 726ot.add_language ('qvn', 'QWH') 727ot.add_language ('qvo', 'QVI') 728ot.add_language ('qvp', 'QWH') 729ot.add_language ('qvw', 'QWH') 730ot.add_language ('qvz', 'QVI') 731ot.add_language ('qwa', 'QWH') 732ot.add_language ('qws', 'QWH') 733ot.add_language ('qxa', 'QWH') 734ot.add_language ('qxc', 'QWH') 735ot.add_language ('qxh', 'QWH') 736ot.add_language ('qxl', 'QVI') 737ot.add_language ('qxn', 'QWH') 738ot.add_language ('qxo', 'QWH') 739ot.add_language ('qxr', 'QVI') 740ot.add_language ('qxt', 'QWH') 741ot.add_language ('qxw', 'QWH') 742 743bcp_47.macrolanguages['ro'].remove ('mo') 744bcp_47.macrolanguages['ro-MD'].add ('mo') 745 746ot.add_language ('sgw', 'SGW') 747ot.names['SGW'] = ot.names['CHG'] + ' (SIL fonts)' 748ot.ranks['SGW'] = ot.ranks['CHG'] + 1 749 750ot.remove_language_ot ('SYRE') 751ot.remove_language_ot ('SYRJ') 752ot.remove_language_ot ('SYRN') 753ot.add_language ('und-Syre', 'SYRE') 754ot.add_language ('und-Syrj', 'SYRJ') 755ot.add_language ('und-Syrn', 'SYRN') 756 757bcp_47.names['xst'] = u"Silt'e" 758bcp_47.scopes['xst'] = ' (retired code)' 759bcp_47.macrolanguages['xst'] = {'stv', 'wle'} 760 761ot.add_language ('xwo', 'TOD') 762 763ot.remove_language_ot ('ZHH') 764ot.remove_language_ot ('ZHP') 765ot.remove_language_ot ('ZHT') 766bcp_47.macrolanguages['zh'].remove ('lzh') 767bcp_47.macrolanguages['zh'].remove ('yue') 768ot.add_language ('zh-Hant-MO', 'ZHH') 769ot.add_language ('zh-Hant-HK', 'ZHH') 770ot.add_language ('zh-Hans', 'ZHS') 771ot.add_language ('zh-Hant', 'ZHT') 772ot.add_language ('zh-HK', 'ZHH') 773ot.add_language ('zh-MO', 'ZHH') 774ot.add_language ('zh-TW', 'ZHT') 
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}

def rank_delta (bcp_47, ot):
    """Return a delta to apply to a BCP 47 tag's rank.

    Most OpenType tags have a constant rank, but a few have ranks that
    depend on the BCP 47 tag.

    Args:
        bcp_47 (str): A BCP 47 tag.
        ot (str): An OpenType tag.

    Returns:
        A number to add to ``ot``'s rank when sorting ``bcp_47``'s
        OpenType equivalents.
    """
    if bcp_47 == 'ak' and ot == 'AKA':
        return -1
    if bcp_47 == 'tw' and ot == 'TWI':
        return -1
    return 0

# Ambiguous OpenType tags and the BCP 47 tags chosen for them; checked and
# normalized by ``verify_disambiguation_dict``.
disambiguation = {
    'ALT': 'alt',
    'ARK': 'rki',
    'BHI': 'bhb',
    'BLN': 'bjt',
    'BTI': 'beb',
    'CCHN': 'cco',
    'CMR': 'swb',
    'CPP': 'crp',
    'CRR': 'crx',
    'DUJ': 'dwu',
    'ECR': 'crj',
    'HAL': 'cfm',
    'HND': 'hnd',
    'KIS': 'kqs',
    'LRC': 'bqi',
    'NDB': 'nd',
    'NIS': 'njz',
    'PLG': 'pce',
    'PRO': 'pro',
    'QIN': 'bgr',
    'QUH': 'quh',
    'QVI': 'qvi',
    'QWH': 'qwh',
    'SIG': 'stv',
    'TNE': 'yrk',
    'ZHH': 'zh-HK',
    'ZHS': 'zh-Hans',
    'ZHT': 'zh-Hant',
}

ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
ot.sort_languages ()

print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' * %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')

def hb_tag (tag):
    """Convert a tag to ``HB_TAG`` form.

    Args:
        tag (str): An OpenType tag.

    Returns:
        A snippet of C++ representing ``tag``.
    """
    # Pad with spaces on the right to exactly four characters.
    return u"HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])

def get_variant_set (name):
    """Return a set of variant language names from a name.

    Args:
        name (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        A set of normalized language names.
    """
    return set (unicodedata.normalize ('NFD', n.replace ('\u2019', u"'"))
                .encode ('ASCII', 'ignore')
                .strip ()
                for n in re.split ('[\n(),]', name) if n)

def language_name_intersection (a, b):
    """Return the names in common between two language names.

    Args:
        a (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.
        b (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        The normalized language names shared by ``a`` and ``b``.
    """
    return get_variant_set (a).intersection (get_variant_set (b))

def get_matching_language_name (intersection, candidates):
    """Return the first candidate sharing a normalized name with
    ``intersection``.

    Args:
        intersection (AbstractSet): A set of normalized language names,
            as returned by ``language_name_intersection``.
        candidates (Iterable[str]): Language names to choose from.

    Raises:
        StopIteration: No candidate matches.
    """
    return next (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))

def same_tag (bcp_47_tag, ot_tags):
    """Return whether ``bcp_47_tag`` is a three-letter tag whose only
    OpenType tag is the same tag in upper case.
    """
    return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()

for language, tags in sorted (ot.from_bcp_47.items ()):
    if language == '' or '-' in language:
        continue
    commented_out = same_tag (language, tags)
    for tag in tags:
        print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
        if commented_out:
            print ('*/', end='')
        print ('\t/* ', end='')
        bcp_47_name = bcp_47.names.get (language, '')
        bcp_47_name_candidates = bcp_47_name.split ('\n')
        intersection = language_name_intersection (bcp_47_name, ot.names[tag])
        scope = bcp_47.scopes.get (language, '')
        if not intersection:
            write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
        else:
            name = get_matching_language_name (intersection, bcp_47_name_candidates)
            bcp_47.names[language] = name
            write ('%s%s' % (name if len (name) > len (ot.names[tag]) else ot.names[tag], scope))
        print (' */')

print ('};')
print ()

print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @language_count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static bool')
print ('hb_ot_tags_from_complex_language (const char *lang_str,')
print ('\t\t\t\t const char *limit,')
print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
print ('{')

def print_subtag_matches (subtag, new_line):
    """Print a ``subtag_matches`` call for ``subtag``, if it is not empty.

    Args:
        subtag (str): A subtag to match, or a falsy value to print
            nothing.
        new_line (bool): Whether to start a new line with ``&&`` before
            the call.
    """
    if subtag:
        if new_line:
            print ()
            print ('\t&& ', end='')
        print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')

# Group the complex tags, longest first, by ``LanguageTag.get_group``.
complex_tags = collections.defaultdict (list)
for initial, group in itertools.groupby ((lt_tags for lt_tags in [
            (LanguageTag (language), tags)
            for language, tags in sorted (ot.from_bcp_47.items (),
                                          key=lambda i: (-len (i[0]), i[0]))
        ] if lt_tags[0].is_complex ()),
        key=lambda lt_tags: lt_tags[0].get_group ()):
    complex_tags[initial] += group

# Tags in the 'und' group are matched before the ``switch`` statement.
for initial, items in sorted (complex_tags.items ()):
    if initial != 'und':
        continue
    for lt, tags in items:
        if lt.variant in bcp_47.prefixes:
            expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
                    '%s is not a valid prefix of %s' % (lt.language, lt.variant))
        print (' if (', end='')
        print_subtag_matches (lt.script, False)
        print_subtag_matches (lt.region, False)
        print_subtag_matches (lt.variant, False)
        print (')')
        print (' {')
        write (' /* %s */' % bcp_47.get_name (lt))
        print ()
        if len (tags) == 1:
            write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
            print ()
            print (' *count = 1;')
        else:
            # Declare the loop counter used by the generated ``for`` loop,
            # as the equivalent branch inside the ``switch`` does.
            print (' unsigned int i;')
            print (' hb_tag_t possible_tags[] = {')
            for tag in tags:
                write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
                print ()
            print (' };')
            print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
            print (' tags[i] = possible_tags[i];')
            print (' *count = i;')
        print (' return true;')
        print (' }')

print (' switch (lang_str[0])')
print (' {')
for initial, items in sorted (complex_tags.items ()):
    if initial == 'und':
        continue
    print (" case '%s':" % initial)
    for lt, tags in items:
        print (' if (', end='')
        if lt.grandfathered:
            print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
        else:
            # Fold the script (and then the region) into the string literal;
            # clearing them keeps them out of the ``subtag_matches`` calls
            # and out of the name comment below.
            string_literal = lt.language[1:] + '-'
            if lt.script:
                string_literal += lt.script
                lt.script = None
                if lt.region:
                    string_literal += '-' + lt.region
                    lt.region = None
            if string_literal[-1] == '-':
                print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
            else:
                print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
        print_subtag_matches (lt.script, True)
        print_subtag_matches (lt.region, True)
        print_subtag_matches (lt.variant, True)
        print (')')
        print (' {')
        write (' /* %s */' % bcp_47.get_name (lt))
        print ()
        if len (tags) == 1:
            write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
            print ()
            print (' *count = 1;')
        else:
            print (' unsigned int i;')
            print (' hb_tag_t possible_tags[] = {')
            for tag in tags:
                write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
                print ()
            print (' };')
            print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
            print ('\ttags[i] = possible_tags[i];')
            print (' *count = i;')
        print (' return true;')
        print (' }')
    print (' break;')

print (' }')
print (' return false;')
print ('}')
print ()
print ('/**')
print (' * hb_ot_ambiguous_tag_to_language')
print (' * @tag: A language tag.')
print (' *')
print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
print (' * many language tags) and the best tag is not the alphabetically first, or if')
print (' * the best tag consists of multiple subtags, or if the best tag does not appear')
print (' * in #ot_languages.')
print (' *')
print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
print (' **/')
print ('static hb_language_t')
print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
print ('{')
print (' switch (tag)')
print (' {')

def verify_disambiguation_dict ():
    """Verify and normalize ``disambiguation``.

    ``disambiguation`` is a map of ambiguous OpenType language system
    tags to the particular BCP 47 tags they correspond to. This function
    checks that all its keys really are ambiguous and that each key's
    value is valid for that key. It checks that no ambiguous tag is
    missing, except when it can figure out which BCP 47 tag is the best
    by itself.

    It modifies ``disambiguation`` to remove keys whose values are the
    same as those that the fallback would return anyway, and to add
    ambiguous keys whose disambiguations it determined automatically.

    Raises:
        AssertionError: Verification failed.
    """
    for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
        # The non-grandfathered BCP 47 tags whose first OpenType tag is
        # ``ot_tag``.
        primary_tags = [t for t in bcp_47_tags
                        if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag]
        if len (primary_tags) == 1:
            expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
            if '-' in primary_tags[0]:
                disambiguation[ot_tag] = primary_tags[0]
        elif len (primary_tags) == 0:
            expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
        else:
            # Try to pick a unique candidate automatically: a macrolanguage,
            # else a family, else a code that is not retired.
            macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
            if len (macrolanguages) != 1:
                macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]']
            if len (macrolanguages) != 1:
                macrolanguages = [t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')]
            if len (macrolanguages) != 1:
                expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
                expect (disambiguation[ot_tag] in bcp_47_tags,
                        '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
            elif ot_tag not in disambiguation:
                disambiguation[ot_tag] = macrolanguages[0]
            different_primary_tags = sorted (t for t in primary_tags if not same_tag (t, ot.from_bcp_47.get (t)))
            if different_primary_tags and disambiguation[ot_tag] == different_primary_tags[0] and '-' not in disambiguation[ot_tag]:
                del disambiguation[ot_tag]
    for ot_tag in disambiguation.keys ():
        expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
# Check and normalize the disambiguation table, then emit one ``case`` of the
# generated ``switch`` per remaining entry, in sorted order of OpenType tag.
verify_disambiguation_dict ()
for ambiguous_tag in sorted (disambiguation):
    chosen_tag = disambiguation[ambiguous_tag]
    case_line = ' case %s: /* %s */' % (hb_tag (ambiguous_tag), ot.names[ambiguous_tag])
    write (case_line)
    print ()
    chosen_name = bcp_47.get_name (LanguageTag (chosen_tag))
    return_line = ' return hb_language_from_string (\"%s\", -1); /* %s */' % (chosen_tag, chosen_name)
    write (return_line)
    print ()

# Close the generated ``switch``, function and include guard. Empty strings
# stand for blank output lines.
for footer_line in (
        ' default:',
        ' return HB_LANGUAGE_INVALID;',
        ' }',
        '}',
        '',
        '#endif /* HB_OT_TAG_TABLE_HH */',
        '',
        '/* == End of generated table == */'):
    print (footer_line)