# -*- coding: utf-8 -*-
from fontTools.misc import sstruct
from fontTools.misc.textTools import bytechr, byteord, bytesjoin, strjoin, tobytes, tostr, safeEval
from fontTools.misc.encodingTools import getEncoding
from fontTools.ttLib import newTable
from . import DefaultTable
import struct
import logging


log = logging.getLogger(__name__)

nameRecordFormat = """
    >   # big endian
    platformID: H
    platEncID:  H
    langID:     H
    nameID:     H
    length:     H
    offset:     H
"""

nameRecordSize = sstruct.calcsize(nameRecordFormat)


class table__n_a_m_e(DefaultTable.DefaultTable):
    """The 'name' table: a list of NameRecord objects stored in ``self.names``."""

    dependencies = ["ltag"]

    def decompile(self, data, ttFont):
        """Parse the binary 'name' table 'data' into ``self.names``.

        Malformed records (truncated record array, or a string range that
        falls outside the string storage area) are logged and skipped.
        """
        tableFormat, n, stringOffset = struct.unpack(b">HHH", data[:6])
        expectedStringOffset = 6 + n * nameRecordSize
        if stringOffset != expectedStringOffset:
            log.error(
                "'name' table stringOffset incorrect. Expected: %s; Actual: %s",
                expectedStringOffset, stringOffset)
        stringData = data[stringOffset:]
        data = data[6:]
        self.names = []
        for i in range(n):
            if len(data) < nameRecordSize:
                log.error('skipping malformed name record #%d', i)
                continue
            name, data = sstruct.unpack2(nameRecordFormat, data, NameRecord())
            # Validate the range before slicing, so we never keep a
            # silently truncated string.
            if name.offset + name.length > len(stringData):
                log.error('skipping malformed name record #%d', i)
                continue
            name.string = stringData[name.offset:name.offset+name.length]
            assert len(name.string) == name.length
            # offset and length are only meaningful in the binary form
            del name.offset, name.length
            self.names.append(name)

    def compile(self, ttFont):
        """Return the binary form of the table.

        ``self.names`` is sorted in place (see NameRecord.__lt__()), and
        identical strings share a single entry in the string storage area.
        """
        if not hasattr(self, "names"):
            # only happens when there are NO name table entries read
            # from the TTX file
            self.names = []
        names = self.names
        names.sort()  # sort according to the spec; see NameRecord.__lt__()
        format = 0
        n = len(names)
        stringOffset = 6 + n * nameRecordSize
        records = [struct.pack(b">HHH", format, n, stringOffset)]
        strings = []
        stringDataLength = 0
        done = {}  # remember the data so we can reuse the "pointers"
        for name in names:
            string = name.toBytes()
            if string in done:
                name.offset, name.length = done[string]
            else:
                name.offset, name.length = done[string] = stringDataLength, len(string)
                strings.append(string)
                stringDataLength += len(string)
            records.append(sstruct.pack(nameRecordFormat, name))
        # Join once at the end instead of re-copying the buffer per record.
        return bytesjoin(records + strings)

    def toXML(self, writer, ttFont):
        """Write every name record to 'writer'."""
        for name in self.names:
            name.toXML(writer, ttFont)

    def fromXML(self, name, attrs, content, ttFont):
        """Append a NameRecord built from a <namerecord> element."""
        if name != "namerecord":
            return  # ignore unknown tags
        if not hasattr(self, "names"):
            self.names = []
        record = NameRecord()
        self.names.append(record)
        # Pass the tag name (not the record itself) as the first argument.
        record.fromXML(name, attrs, content, ttFont)

    def getName(self, nameID, platformID, platEncID, langID=None):
        """Return the first matching NameRecord, or None if not found.

        If 'langID' is None, the language of the record is not considered.
        """
        for namerecord in self.names:
            if (namerecord.nameID == nameID and
                    namerecord.platformID == platformID and
                    namerecord.platEncID == platEncID):
                if langID is None or namerecord.langID == langID:
                    return namerecord
        return None  # not found

    def getDebugName(self, nameID):
        """Return a decodable string for 'nameID', preferring English.

        Returns None if no record with that nameID can be decoded.
        """
        englishName = someName = None
        for name in self.names:
            if name.nameID != nameID:
                continue
            try:
                unistr = name.toUnicode()
            except UnicodeDecodeError:
                continue

            someName = unistr
            if (name.platformID, name.langID) in ((1, 0), (3, 0x409)):
                englishName = unistr
                break
        if englishName:
            return englishName
        elif someName:
            return someName
        else:
            return None

    def getFirstDebugName(self, nameIDs):
        """Return the debug name of the first of 'nameIDs' that has one, or None."""
        for nameID in nameIDs:
            name = self.getDebugName(nameID)
            if name is not None:
                return name
        return None

    def getBestFamilyName(self):
        # 21 = WWS Family Name
        # 16 = Typographic Family Name
        # 1 = Family Name
        return self.getFirstDebugName((21, 16, 1))

    def getBestSubFamilyName(self):
        # 22 = WWS SubFamily Name
        # 17 = Typographic SubFamily Name
        # 2 = SubFamily Name
        return self.getFirstDebugName((22, 17, 2))

    def getBestFullName(self):
        # 4 = Full Name
        # 6 = PostScript Name
        for nameIDs in ((21, 22), (16, 17), (1, 2), (4, ), (6, )):
            if len(nameIDs) == 2:
                name_fam = self.getDebugName(nameIDs[0])
                name_subfam = self.getDebugName(nameIDs[1])
                if None in [name_fam, name_subfam]:
                    continue  # if any is None, skip
                name = f"{name_fam} {name_subfam}"
                if name_subfam.lower() == 'regular':
                    name = f"{name_fam}"
                return name
            else:
                name = self.getDebugName(nameIDs[0])
                if name is not None:
                    return name
        return None

    def setName(self, string, nameID, platformID, platEncID, langID):
        """ Set the 'string' for the name record identified by 'nameID', 'platformID',
        'platEncID' and 'langID'. If a record with that nameID doesn't exist, create it
        and append to the name table.

        'string' can be of type `str` (`unicode` in PY2) or `bytes`. In the latter case,
        it is assumed to be already encoded with the correct platform-specific encoding
        identified by the (platformID, platEncID, langID) triplet. A warning is issued
        to prevent unexpected results.
        """
        if not hasattr(self, 'names'):
            self.names = []
        if not isinstance(string, str):
            if isinstance(string, bytes):
                log.warning(
                    "name string is bytes, ensure it's correctly encoded: %r", string)
            else:
                raise TypeError(
                    "expected unicode or bytes, found %s: %r" % (
                        type(string).__name__, string))
        namerecord = self.getName(nameID, platformID, platEncID, langID)
        if namerecord is not None:
            namerecord.string = string
        else:
            self.names.append(makeName(string, nameID, platformID, platEncID, langID))

    def removeNames(self, nameID=None, platformID=None, platEncID=None, langID=None):
        """Remove any name records identified by the given combination of 'nameID',
        'platformID', 'platEncID' and 'langID'.

        Arguments left as None are not used for matching; if all of them are
        None, nothing is removed.
        """
        args = {
            argName: argValue
            for argName, argValue in (
                ("nameID", nameID),
                ("platformID", platformID),
                ("platEncID", platEncID),
                ("langID", langID),
            )
            if argValue is not None
        }
        if not args:
            # no arguments, nothing to do
            return
        # Keep a record if it differs from the request in at least one field.
        self.names = [
            rec for rec in self.names
            if any(
                argValue != getattr(rec, argName)
                for argName, argValue in args.items()
            )
        ]

    def _findUnusedNameID(self, minNameID=256):
        """Finds an unused name id.

        The nameID is assigned in the range between 'minNameID' and 32767 (inclusive),
        following the last nameID in the name table.
        """
        names = getattr(self, 'names', [])
        nameID = 1 + max([n.nameID for n in names] + [minNameID - 1])
        if nameID > 32767:
            raise ValueError("nameID must be less than 32768")
        return nameID

    def findMultilingualName(self, names, windows=True, mac=True, minNameID=0):
        """Return the name ID of an existing multilingual name that
        matches the 'names' dictionary, or None if not found.

        'names' is a dictionary with the name in multiple languages,
        such as {'en': 'Pale', 'de': 'Blaß', 'de-CH': 'Blass'}.
        The keys can be arbitrary IETF BCP 47 language codes;
        the values are Unicode strings.

        If 'windows' is True, the returned name ID is guaranteed to
        exist for all requested languages for platformID=3 and
        platEncID=1.
        If 'mac' is True, the returned name ID is guaranteed to exist
        for all requested languages for platformID=1 and platEncID=0.

        The returned name ID will not be less than the 'minNameID'
        argument.
        """
        # Gather the set of requested
        #   (string, platformID, platEncID, langID)
        # tuples
        reqNameSet = set()
        for lang, name in sorted(names.items()):
            if windows:
                windowsName = _makeWindowsName(name, None, lang)
                if windowsName is not None:
                    reqNameSet.add((windowsName.string,
                                    windowsName.platformID,
                                    windowsName.platEncID,
                                    windowsName.langID))
            if mac:
                macName = _makeMacName(name, None, lang)
                if macName is not None:
                    reqNameSet.add((macName.string,
                                    macName.platformID,
                                    macName.platEncID,
                                    macName.langID))

        # Collect matching name IDs
        matchingNames = dict()
        for name in self.names:
            try:
                key = (name.toUnicode(), name.platformID,
                       name.platEncID, name.langID)
            except UnicodeDecodeError:
                continue
            if key in reqNameSet and name.nameID >= minNameID:
                nameSet = matchingNames.setdefault(name.nameID, set())
                nameSet.add(key)

        # Return the first name ID that defines all requested strings
        for nameID, nameSet in sorted(matchingNames.items()):
            if nameSet == reqNameSet:
                return nameID

        return None  # not found

    def addMultilingualName(self, names, ttFont=None, nameID=None,
                            windows=True, mac=True, minNameID=0):
        """Add a multilingual name, returning its name ID

        'names' is a dictionary with the name in multiple languages,
        such as {'en': 'Pale', 'de': 'Blaß', 'de-CH': 'Blass'}.
        The keys can be arbitrary IETF BCP 47 language codes;
        the values are Unicode strings.

        'ttFont' is the TTFont to which the names are added, or None.
        If present, the font's 'ltag' table can get populated
        to store exotic language codes, which allows encoding
        names that otherwise cannot get encoded at all.

        'nameID' is the name ID to be used, or None to let the library
        find an existing set of name records that match, or pick an
        unused name ID.

        If 'windows' is True, a platformID=3 name record will be added.
        If 'mac' is True, a platformID=1 name record will be added.

        If the 'nameID' argument is None, the created nameID will not
        be less than the 'minNameID' argument.
        """
        if not hasattr(self, 'names'):
            self.names = []
        if nameID is None:
            # Reuse nameID if possible
            nameID = self.findMultilingualName(
                names, windows=windows, mac=mac, minNameID=minNameID)
            if nameID is not None:
                return nameID
            # Honour 'minNameID' for freshly allocated IDs too, while never
            # going below 256 (the default lower bound of _findUnusedNameID).
            nameID = self._findUnusedNameID(max(minNameID, 256))
        # TODO: Should minimize BCP 47 language codes.
        # https://github.com/fonttools/fonttools/issues/930
        for lang, name in sorted(names.items()):
            if windows:
                windowsName = _makeWindowsName(name, nameID, lang)
                if windowsName is not None:
                    self.names.append(windowsName)
                else:
                    # We cannot make a Windows name: make sure we add a
                    # Mac name as a fallback. This can happen for exotic
                    # BCP47 language tags that have no Windows language code.
                    mac = True
            if mac:
                macName = _makeMacName(name, nameID, lang, ttFont)
                if macName is not None:
                    self.names.append(macName)
        return nameID

    def addName(self, string, platforms=((1, 0, 0), (3, 1, 0x409)), minNameID=255):
        """ Add a new name record containing 'string' for each (platformID, platEncID,
        langID) tuple specified in the 'platforms' list.

        The nameID is assigned in the range between 'minNameID'+1 and 32767 (inclusive),
        following the last nameID in the name table.
        If no 'platforms' are specified, two English name records are added, one for the
        Macintosh (platformID=1), and one for the Windows platform (3).

        The 'string' must be a Unicode string, so it can be encoded with different,
        platform-specific encodings.

        Return the new nameID.
        """
        assert len(platforms) > 0, \
            "'platforms' must contain at least one (platformID, platEncID, langID) tuple"
        if not hasattr(self, 'names'):
            self.names = []
        if not isinstance(string, str):
            raise TypeError(
                "expected str, found %s: %r" % (type(string).__name__, string))
        nameID = self._findUnusedNameID(minNameID + 1)
        for platformID, platEncID, langID in platforms:
            self.names.append(makeName(string, nameID, platformID, platEncID, langID))
        return nameID


def makeName(string, nameID, platformID, platEncID, langID):
    """Return a new NameRecord populated with the given values."""
    name = NameRecord()
    name.string, name.nameID, name.platformID, name.platEncID, name.langID = (
        string, nameID, platformID, platEncID, langID)
    return name


def _makeWindowsName(name, nameID, language):
    """Create a NameRecord for the Microsoft Windows platform

    'language' is an arbitrary IETF BCP 47 language identifier such
    as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. If Microsoft Windows
    does not support the desired language, the result will be None.
    Future versions of fonttools might return a NameRecord for the
    OpenType 'name' table format 1, but this is not implemented yet.
    """
    langID = _WINDOWS_LANGUAGE_CODES.get(language.lower())
    if langID is not None:
        return makeName(name, nameID, 3, 1, langID)
    else:
        log.warning("cannot add Windows name in language %s "
                    "because fonttools does not yet support "
                    "name table format 1", language)
        return None


def _makeMacName(name, nameID, language, font=None):
    """Create a NameRecord for Apple platforms

    'language' is an arbitrary IETF BCP 47 language identifier such
    as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. When possible, we
    create a Macintosh NameRecord that is understood by old applications
    (platform ID 1 and an old-style Macintosh language enum). If this
    is not possible, we create a Unicode NameRecord (platform ID 0)
    whose language points to the font’s 'ltag' table. The latter
    can encode any string in any language, but legacy applications
    might not recognize the format (in which case they will ignore
    those names).

    'font' should be the TTFont for which you want to create a name.
    If 'font' is None, we only return NameRecords for legacy Macintosh;
    in that case, the result will be None for names that need to
    be encoded with an 'ltag' table.

    See the section “The language identifier” in Apple’s specification:
    https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
    """
    macLang = _MAC_LANGUAGE_CODES.get(language.lower())
    macScript = _MAC_LANGUAGE_TO_SCRIPT.get(macLang)
    if macLang is not None and macScript is not None:
        encoding = getEncoding(1, macScript, macLang, default="ascii")
        # Check if we can actually encode this name. If we can't,
        # for example because we have no support for the legacy
        # encoding, or because the name string contains Unicode
        # characters that the legacy encoding cannot represent,
        # we fall back to encoding the name in Unicode and put
        # the language tag into the ltag table.
        try:
            _ = tobytes(name, encoding, errors="strict")
            return makeName(name, nameID, 1, macScript, macLang)
        except UnicodeEncodeError:
            pass
    if font is not None:
        ltag = font.tables.get("ltag")
        if ltag is None:
            ltag = font["ltag"] = newTable("ltag")
        # 0 = Unicode; 4 = “Unicode 2.0 or later semantics (non-BMP characters allowed)”
        # “The preferred platform-specific code for Unicode would be 3 or 4.”
        # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
        return makeName(name, nameID, 0, 4, ltag.addTag(language))
    else:
        log.warning("cannot store language %s into 'ltag' table "
                    "without having access to the TTFont object",
                    language)
        return None


class NameRecord(object):
    """A single entry of the 'name' table.

    Instances carry the attributes 'string', 'nameID', 'platformID',
    'platEncID' and 'langID'; 'string' may be either bytes or str.
    """

    def getEncoding(self, default='ascii'):
        """Returns the Python encoding name for this name entry based on its platformID,
        platEncID, and langID.  If encoding for these values is not known, by default
        'ascii' is returned.  That can be overridden by passing a value to the default
        argument.
        """
        return getEncoding(self.platformID, self.platEncID, self.langID, default)

    def encodingIsUnicodeCompatible(self):
        return self.getEncoding(None) in ['utf_16_be', 'ucs2be', 'ascii', 'latin1']

    def __str__(self):
        return self.toStr(errors='backslashreplace')

    def isUnicode(self):
        return (self.platformID == 0 or
                (self.platformID == 3 and self.platEncID in [0, 1, 10]))

    def toUnicode(self, errors='strict'):
        """
        If self.string is a Unicode string, return it; otherwise try decoding the
        bytes in self.string to a Unicode string using the encoding of this
        entry as returned by self.getEncoding(); Note that  self.getEncoding()
        returns 'ascii' if the encoding is unknown to the library.

        Certain heuristics are performed to recover data from bytes that are
        ill-formed in the chosen encoding, or that otherwise look misencoded
        (mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE
        but marked otherwise).  If the bytes are ill-formed and the heuristics fail,
        the error is handled according to the errors parameter to this function, which is
        passed to the underlying decode() function; by default it throws a
        UnicodeDecodeError exception.

        Note: The mentioned heuristics mean that roundtripping a font to XML and back
        to binary might recover some misencoded data whereas just loading the font
        and saving it back will not change them.
        """
        def isascii(b):
            return (b >= 0x20 and b <= 0x7E) or b in [0x09, 0x0A, 0x0D]
        encoding = self.getEncoding()
        string = self.string

        if isinstance(string, bytes) and encoding == 'utf_16_be' and len(string) % 2 == 1:
            # Recover badly encoded UTF-16 strings that have an odd number of bytes:
            # - If the last byte is zero, drop it.  Otherwise,
            # - If all the odd bytes are zero and all the even bytes are ASCII,
            #   prepend one zero byte.  Otherwise,
            # - If first byte is zero and all other bytes are ASCII, insert zero
            #   bytes between consecutive ASCII bytes.
            #
            # (Yes, I've seen all of these in the wild... sigh)
            if byteord(string[-1]) == 0:
                string = string[:-1]
            elif all(byteord(b) == 0 if i % 2 else isascii(byteord(b)) for i,b in enumerate(string)):
                string = b'\0' + string
            elif byteord(string[0]) == 0 and all(isascii(byteord(b)) for b in string[1:]):
                string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:])

        string = tostr(string, encoding=encoding, errors=errors)

        # If decoded strings still looks like UTF-16BE, it suggests a double-encoding.
        # Fix it up.
        if all(ord(c) == 0 if i % 2 == 0 else isascii(ord(c)) for i,c in enumerate(string)):
            # If string claims to be Mac encoding, but looks like UTF-16BE with ASCII text,
            # narrow it down.
            string = ''.join(c for c in string[1::2])

        return string

    def toBytes(self, errors='strict'):
        """ If self.string is a bytes object, return it; otherwise try encoding
        the Unicode string in self.string to bytes using the encoding of this
        entry as returned by self.getEncoding(); Note that self.getEncoding()
        returns 'ascii' if the encoding is unknown to the library.

        If the Unicode string cannot be encoded to bytes in the chosen encoding,
        the error is handled according to the errors parameter to this function,
        which is passed to the underlying encode() function; by default it throws a
        UnicodeEncodeError exception.
        """
        return tobytes(self.string, encoding=self.getEncoding(), errors=errors)

    toStr = toUnicode

    def toXML(self, writer, ttFont):
        """Write this record as a <namerecord> element.

        If the string cannot be decoded, the raw bytes are written instead
        and the element carries unicode="False".
        """
        try:
            unistr = self.toUnicode()
        except UnicodeDecodeError:
            unistr = None
        attrs = [
            ("nameID", self.nameID),
            ("platformID", self.platformID),
            ("platEncID", self.platEncID),
            ("langID", hex(self.langID)),
        ]

        if unistr is None or not self.encodingIsUnicodeCompatible():
            attrs.append(("unicode", unistr is not None))

        writer.begintag("namerecord", attrs)
        writer.newline()
        if unistr is not None:
            writer.write(unistr)
        else:
            writer.write8bit(self.string)
        writer.newline()
        writer.endtag("namerecord")
        writer.newline()

    def fromXML(self, name, attrs, content, ttFont):
        """Populate this record from the attributes and text of a <namerecord> element."""
        self.nameID = safeEval(attrs["nameID"])
        self.platformID = safeEval(attrs["platformID"])
        self.platEncID = safeEval(attrs["platEncID"])
        self.langID = safeEval(attrs["langID"])
        s = strjoin(content).strip()
        encoding = self.getEncoding()
        if self.encodingIsUnicodeCompatible() or safeEval(attrs.get("unicode", "False")):
            self.string = s.encode(encoding)
        else:
            # This is the inverse of write8bit...
            self.string = s.encode("latin1")

    def __lt__(self, other):
        if type(self) != type(other):
            return NotImplemented

        try:
            # implemented so that list.sort() sorts according to the spec.
            selfTuple = (
                self.platformID,
                self.platEncID,
                self.langID,
                self.nameID,
                self.toBytes(),
            )
            otherTuple = (
                other.platformID,
                other.platEncID,
                other.langID,
                other.nameID,
                other.toBytes(),
            )
            return selfTuple < otherTuple
        except (UnicodeEncodeError, AttributeError):
            # This can only happen for
            # 1) an object that is not a NameRecord, or
            # 2) an unlikely incomplete NameRecord object which has not been
            #    fully populated, or
            # 3) when all IDs are identical but the strings can't be encoded
            #    for their platform encoding.
            # In all cases it is best to return NotImplemented.
            return NotImplemented

    def __repr__(self):
        return "<NameRecord NameID=%d; PlatformID=%d; LanguageID=%d>" % (
            self.nameID, self.platformID, self.langID)


# Windows language ID → IETF BCP-47 language tag
#
# While Microsoft indicates a region/country for all its language
# IDs, we follow Unicode practice by omitting “most likely subtags”
# as per Unicode CLDR. For example, English is simply “en” and not
# “en-Latn” because according to Unicode, the default script
# for English is Latin.
#
# http://www.unicode.org/cldr/charts/latest/supplemental/likely_subtags.html
# http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
_WINDOWS_LANGUAGES = {
    0x0436: 'af',
    0x041C: 'sq',
    0x0484: 'gsw',
    0x045E: 'am',
    0x1401: 'ar-DZ',
    0x3C01: 'ar-BH',
    0x0C01: 'ar',
    0x0801: 'ar-IQ',
    0x2C01: 'ar-JO',
    0x3401: 'ar-KW',
    0x3001: 'ar-LB',
    0x1001: 'ar-LY',
    0x1801: 'ary',
    0x2001: 'ar-OM',
    0x4001: 'ar-QA',
    0x0401: 'ar-SA',
    0x2801: 'ar-SY',
    0x1C01: 'aeb',
    0x3801: 'ar-AE',
    0x2401: 'ar-YE',
    0x042B: 'hy',
    0x044D: 'as',
    0x082C: 'az-Cyrl',
    0x042C: 'az',
    0x046D: 'ba',
    0x042D: 'eu',
    0x0423: 'be',
    0x0845: 'bn',
    0x0445: 'bn-IN',
    0x201A: 'bs-Cyrl',
    0x141A: 'bs',
    0x047E: 'br',
    0x0402: 'bg',
    0x0403: 'ca',
    0x0C04: 'zh-HK',
    0x1404: 'zh-MO',
    0x0804: 'zh',
    0x1004: 'zh-SG',
    0x0404: 'zh-TW',
    0x0483: 'co',
    0x041A: 'hr',
    0x101A: 'hr-BA',
    0x0405: 'cs',
    0x0406: 'da',
    0x048C: 'prs',
    0x0465: 'dv',
    0x0813: 'nl-BE',
    0x0413: 'nl',
    0x0C09: 'en-AU',
    0x2809: 'en-BZ',
    0x1009: 'en-CA',
    0x2409: 'en-029',
    0x4009: 'en-IN',
    0x1809: 'en-IE',
    0x2009: 'en-JM',
    0x4409: 'en-MY',
    0x1409: 'en-NZ',
    0x3409: 'en-PH',
    0x4809: 'en-SG',
    0x1C09: 'en-ZA',
    0x2C09: 'en-TT',
    0x0809: 'en-GB',
    0x0409: 'en',
    0x3009: 'en-ZW',
    0x0425: 'et',
    0x0438: 'fo',
    0x0464: 'fil',
    0x040B: 'fi',
    0x080C: 'fr-BE',
    0x0C0C: 'fr-CA',
    0x040C: 'fr',
    0x140C: 'fr-LU',
    0x180C: 'fr-MC',
    0x100C: 'fr-CH',
    0x0462: 'fy',
    0x0456: 'gl',
    0x0437: 'ka',
    0x0C07: 'de-AT',
    0x0407: 'de',
    0x1407: 'de-LI',
    0x1007: 'de-LU',
    0x0807: 'de-CH',
    0x0408: 'el',
    0x046F: 'kl',
    0x0447: 'gu',
    0x0468: 'ha',
    0x040D: 'he',
    0x0439: 'hi',
    0x040E: 'hu',
    0x040F: 'is',
    0x0470: 'ig',
    0x0421: 'id',
    0x045D: 'iu',
    0x085D: 'iu-Latn',
    0x083C: 'ga',
    0x0434: 'xh',
    0x0435: 'zu',
    0x0410: 'it',
    0x0810: 'it-CH',
    0x0411: 'ja',
    0x044B: 'kn',
    0x043F: 'kk',
    0x0453: 'km',
    0x0486: 'quc',
    0x0487: 'rw',
    0x0441: 'sw',
    0x0457: 'kok',
    0x0412: 'ko',
    0x0440: 'ky',
    0x0454: 'lo',
    0x0426: 'lv',
    0x0427: 'lt',
    0x082E: 'dsb',
    0x046E: 'lb',
    0x042F: 'mk',
    0x083E: 'ms-BN',
    0x043E: 'ms',
    0x044C: 'ml',
    0x043A: 'mt',
    0x0481: 'mi',
    0x047A: 'arn',
    0x044E: 'mr',
    0x047C: 'moh',
    0x0450: 'mn',
    0x0850: 'mn-CN',
    0x0461: 'ne',
    0x0414: 'nb',
    0x0814: 'nn',
    0x0482: 'oc',
    0x0448: 'or',
    0x0463: 'ps',
    0x0415: 'pl',
    0x0416: 'pt',
    0x0816: 'pt-PT',
    0x0446: 'pa',
    0x046B: 'qu-BO',
    0x086B: 'qu-EC',
    0x0C6B: 'qu',
    0x0418: 'ro',
    0x0417: 'rm',
    0x0419: 'ru',
    0x243B: 'smn',
    0x103B: 'smj-NO',
    0x143B: 'smj',
    0x0C3B: 'se-FI',
    0x043B: 'se',
    0x083B: 'se-SE',
    0x203B: 'sms',
    0x183B: 'sma-NO',
    0x1C3B: 'sma',  # Sami (Southern), Sweden; must not shadow 'sms' (0x203B)
    0x044F: 'sa',
    0x1C1A: 'sr-Cyrl-BA',
    0x0C1A: 'sr',
    0x181A: 'sr-Latn-BA',
    0x081A: 'sr-Latn',
    0x046C: 'nso',
    0x0432: 'tn',
    0x045B: 'si',
    0x041B: 'sk',
    0x0424: 'sl',
    0x2C0A: 'es-AR',
    0x400A: 'es-BO',
    0x340A: 'es-CL',
    0x240A: 'es-CO',
    0x140A: 'es-CR',
    0x1C0A: 'es-DO',
    0x300A: 'es-EC',
    0x440A: 'es-SV',
    0x100A: 'es-GT',
    0x480A: 'es-HN',
    0x080A: 'es-MX',
    0x4C0A: 'es-NI',
    0x180A: 'es-PA',
    0x3C0A: 'es-PY',
    0x280A: 'es-PE',
    0x500A: 'es-PR',

    # Microsoft has defined two different language codes for
    # “Spanish with modern sorting” and “Spanish with traditional
    # sorting”. This makes sense for collation APIs, and it would be
    # possible to express this in BCP 47 language tags via Unicode
    # extensions (eg., “es-u-co-trad” is “Spanish with traditional
    # sorting”). However, for storing names in fonts, this distinction
    # does not make sense, so we use “es” in both cases.
    0x0C0A: 'es',
    0x040A: 'es',

    0x540A: 'es-US',
    0x380A: 'es-UY',
    0x200A: 'es-VE',
    0x081D: 'sv-FI',
    0x041D: 'sv',
    0x045A: 'syr',
    0x0428: 'tg',
    0x085F: 'tzm',
    0x0449: 'ta',
    0x0444: 'tt',
    0x044A: 'te',
    0x041E: 'th',
    0x0451: 'bo',
    0x041F: 'tr',
    0x0442: 'tk',
    0x0480: 'ug',
    0x0422: 'uk',
    0x042E: 'hsb',
    0x0420: 'ur',
    0x0843: 'uz-Cyrl',
    0x0443: 'uz',
    0x042A: 'vi',
    0x0452: 'cy',
    0x0488: 'wo',
    0x0485: 'sah',
    0x0478: 'ii',
    0x046A: 'yo',
}


# MacOS language ID → IETF BCP-47 language tag
_MAC_LANGUAGES = {
    0: 'en',
    1: 'fr',
    2: 'de',
    3: 'it',
    4: 'nl',
    5: 'sv',
    6: 'es',
    7: 'da',
    8: 'pt',
    9: 'no',
    10: 'he',
    11: 'ja',
    12: 'ar',
    13: 'fi',
    14: 'el',
    15: 'is',
    16: 'mt',
    17: 'tr',
    18: 'hr',
    19: 'zh-Hant',
    20: 'ur',
    21: 'hi',
    22: 'th',
    23: 'ko',
    24: 'lt',
    25: 'pl',
    26: 'hu',
    27: 'et',  # langEstonian; a duplicate 'es' here would shadow Spanish (6)
    28: 'lv',
    29: 'se',
    30: 'fo',
    31: 'fa',
    32: 'ru',
    33: 'zh',
    34: 'nl-BE',
    35: 'ga',
    36: 'sq',
    37: 'ro',
    38: 'cs',  # langCzech; 'cs' is the BCP 47 tag for Czech
    39: 'sk',
    40: 'sl',
    41: 'yi',
    42: 'sr',
    43: 'mk',
    44: 'bg',
    45: 'uk',
    46: 'be',
    47: 'uz',
    48: 'kk',
    49: 'az-Cyrl',
    50: 'az-Arab',
    51: 'hy',
    52: 'ka',
    53: 'mo',
    54: 'ky',
    55: 'tg',
    56: 'tk',
    57: 'mn-CN',
    58: 'mn',
    59: 'ps',
    60: 'ks',
    61: 'ku',
    62: 'sd',
    63: 'bo',
    64: 'ne',
    65: 'sa',
    66: 'mr',
    67: 'bn',
    68: 'as',
    69: 'gu',
    70: 'pa',
    71: 'or',
    72: 'ml',
    73: 'kn',
    74: 'ta',
    75: 'te',
    76: 'si',
    77: 'my',
    78: 'km',
    79: 'lo',
    80: 'vi',
    81: 'id',
    82: 'tl',
    83: 'ms',
    84: 'ms-Arab',
    85: 'am',
    86: 'ti',
    87: 'om',
    88: 'so',
    89: 'sw',
    90: 'rw',
    91: 'rn',
    92: 'ny',
    93: 'mg',
    94: 'eo',
    128: 'cy',
    129: 'eu',
    130: 'ca',
    131: 'la',
    132: 'qu',
    133: 'gn',
    134: 'ay',
    135: 'tt',
    136: 'ug',
    137: 'dz',
    138: 'jv',
    139: 'su',
    140: 'gl',
    141: 'af',
    142: 'br',
    143: 'iu',
    144: 'gd',
    145: 'gv',
    146: 'ga',
    147: 'to',
    148: 'el-polyton',
    149: 'kl',
    150: 'az',
    151: 'nn',
}


# Reverse lookups (lower-cased language tag → language ID). When a tag
# appears more than once in a table above, the last entry wins.
_WINDOWS_LANGUAGE_CODES = {lang.lower(): code for code, lang in _WINDOWS_LANGUAGES.items()}
_MAC_LANGUAGE_CODES = {lang.lower(): code for code, lang in _MAC_LANGUAGES.items()}


# MacOS language ID → MacOS script ID
#
# Note that the script ID is not sufficient to determine what encoding
# to use in TrueType files. For some languages, MacOS used a modification
# of a mainstream script. For example, an Icelandic name would be stored
# with smRoman in the TrueType naming table, but the actual encoding
# is a special Icelandic version of the normal Macintosh Roman encoding.
# As another example, Inuktitut uses an 8-bit encoding for Canadian Aboriginal
# Syllables but MacOS had run out of available script codes, so this was
# done as a (pretty radical) “modification” of Ethiopic.
#
# http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/Readme.txt
_MAC_LANGUAGE_TO_SCRIPT = {
    0: 0,  # langEnglish → smRoman
    1: 0,  # langFrench → smRoman
    2: 0,  # langGerman → smRoman
    3: 0,  # langItalian → smRoman
    4: 0,  # langDutch → smRoman
    5: 0,  # langSwedish → smRoman
    6: 0,  # langSpanish → smRoman
    7: 0,  # langDanish → smRoman
    8: 0,  # langPortuguese → smRoman
    9: 0,  # langNorwegian → smRoman
    10: 5,  # langHebrew → smHebrew
    11: 1,  # langJapanese → smJapanese
    12: 4,  # langArabic → smArabic
    13: 0,  # langFinnish → smRoman
    14: 6,  # langGreek → smGreek
    15: 0,  # langIcelandic → smRoman (modified)
    16: 0,  # langMaltese → smRoman
    17: 0,  # langTurkish → smRoman (modified)
    18: 0,  # langCroatian → smRoman (modified)
    19: 2,  # langTradChinese → smTradChinese
    20: 4,  # langUrdu → smArabic
    21: 9,  # langHindi → smDevanagari
    22: 21,  # langThai → smThai
    23: 3,  # langKorean → smKorean
    24: 29,  # langLithuanian → smCentralEuroRoman
    25: 29,  # langPolish → smCentralEuroRoman
    26: 29,  # langHungarian → smCentralEuroRoman
    27: 29,  # langEstonian → smCentralEuroRoman
    28: 29,  # langLatvian → smCentralEuroRoman
    29: 0,  # langSami → smRoman
    30: 0,  # langFaroese → smRoman (modified)
    31: 4,  # langFarsi → smArabic (modified)
    32: 7,  # langRussian → smCyrillic
    33: 25,  # langSimpChinese → smSimpChinese
    34: 0,  # langFlemish → smRoman
    35: 0,  # langIrishGaelic → smRoman (modified)
    36: 0,  # langAlbanian → smRoman
    37: 0,  # langRomanian → smRoman (modified)
    38: 29,  # langCzech → smCentralEuroRoman
    39: 29,  # langSlovak → smCentralEuroRoman
    40: 0,  # langSlovenian → smRoman (modified)
    41: 5,  # langYiddish → smHebrew
    42: 7,  # langSerbian → smCyrillic
    43: 7,  # langMacedonian → smCyrillic
    44: 7,  # langBulgarian → smCyrillic
    45: 7,  # langUkrainian → smCyrillic (modified)
    46: 7,  # langByelorussian → smCyrillic
    47: 7,  # langUzbek → smCyrillic
    48: 7,  # langKazakh → smCyrillic
    49: 7,  # langAzerbaijani → smCyrillic
    50: 4,  # langAzerbaijanAr → smArabic
    51: 24,  # langArmenian → smArmenian
    52: 23,  # langGeorgian → smGeorgian
    53: 7,  # langMoldavian → smCyrillic
    54: 7,  # langKirghiz → smCyrillic
    55: 7,  # langTajiki → smCyrillic
    56: 7,  # langTurkmen → smCyrillic
    57: 27,  # langMongolian → smMongolian
    58: 7,  # langMongolianCyr → smCyrillic
    59: 4,  # langPashto → smArabic
    60: 4,  # langKurdish → smArabic
    61: 4,  # langKashmiri → smArabic
    62: 4,  # langSindhi → smArabic
    63: 26,  # langTibetan → smTibetan
    64: 9,  # langNepali → smDevanagari
    65: 9,  # langSanskrit → smDevanagari
    66: 9,  # langMarathi → smDevanagari
    67: 13,  # langBengali → smBengali
    68: 13,  # langAssamese → smBengali
    69: 11,  # langGujarati → smGujarati
    70: 10,  # langPunjabi → smGurmukhi
    71: 12,  # langOriya → smOriya
    72: 17,  # langMalayalam → smMalayalam
    73: 16,  # langKannada → smKannada
    74: 14,  # langTamil → smTamil
    75: 15,  # langTelugu → smTelugu
    76: 18,  # langSinhalese → smSinhalese
    77: 19,  # langBurmese → smBurmese
    78: 20,  # langKhmer → smKhmer
    79: 22,  # langLao → smLao
    80: 30,  # langVietnamese → smVietnamese
    81: 0,  # langIndonesian → smRoman
    82: 0,  # langTagalog → smRoman
    83: 0,  # langMalayRoman → smRoman
    84: 4,  # langMalayArabic → smArabic
    85: 28,  # langAmharic → smEthiopic
    86: 28,  # langTigrinya → smEthiopic
    87: 28,  # langOromo → smEthiopic
    88: 0,  # langSomali → smRoman
    89: 0,  # langSwahili → smRoman
    90: 0,  # langKinyarwanda → smRoman
    91: 0,  # langRundi → smRoman
    92: 0,  # langNyanja → smRoman
    93: 0,  # langMalagasy → smRoman
    94: 0,  # langEsperanto → smRoman
    128: 0,  # langWelsh → smRoman (modified)
    129: 0,  # langBasque → smRoman
    130: 0,  # langCatalan → smRoman
    131: 0,  # langLatin → smRoman
    132: 0,  # langQuechua → smRoman
    133: 0,  # langGuarani → smRoman
    134: 0,  # langAymara → smRoman
    135: 7,  # langTatar → smCyrillic
    136: 4,  # langUighur → smArabic
    137: 26,  # langDzongkha → smTibetan
    138: 0,  # langJavaneseRom → smRoman
    139: 0,  # langSundaneseRom → smRoman
    140: 0,  # langGalician → smRoman
    141: 0,  # langAfrikaans → smRoman
    142: 0,  # langBreton → smRoman (modified)
    143: 28,  # langInuktitut → smEthiopic (modified)
    144: 0,  # langScottishGaelic → smRoman (modified)
    145: 0,  # langManxGaelic → smRoman (modified)
    146: 0,  # langIrishGaelicScript → smRoman (modified)
    147: 0,  # langTongan → smRoman
    148: 6,  # langGreekAncient → smGreek
    149: 0,  # langGreenlandic → smRoman
    150: 0,  # langAzerbaijanRoman → smRoman
    151: 0,  # langNynorsk → smRoman
}