# -*- coding: utf-8 -*-
from fontTools.misc.py23 import bytechr, byteord, bytesjoin, strjoin, tobytes, tostr
from fontTools.misc import sstruct
from fontTools.misc.textTools import safeEval
from fontTools.misc.encodingTools import getEncoding
from fontTools.ttLib import newTable
from . import DefaultTable
import struct
import logging


log = logging.getLogger(__name__)

nameRecordFormat = """
    >   # big endian
    platformID: H
    platEncID:  H
    langID:     H
    nameID:     H
    length:     H
    offset:     H
"""

nameRecordSize = sstruct.calcsize(nameRecordFormat)


class table__n_a_m_e(DefaultTable.DefaultTable):
    """The 'name' table: a list of NameRecord objects kept in ``self.names``."""

    dependencies = ["ltag"]

    def decompile(self, data, ttFont):
        """Parse the binary table 'data' and populate ``self.names``.

        Records that are truncated, or whose string lies outside the string
        storage area, are logged as errors and skipped.
        """
        tableFormat, n, stringOffset = struct.unpack(b">HHH", data[:6])
        expectedStringOffset = 6 + n * nameRecordSize
        if stringOffset != expectedStringOffset:
            log.error(
                "'name' table stringOffset incorrect. Expected: %s; Actual: %s",
                expectedStringOffset, stringOffset)
        stringData = data[stringOffset:]
        data = data[6:]
        self.names = []
        for i in range(n):
            if len(data) < nameRecordSize:
                log.error('skipping malformed name record #%d', i)
                continue
            name, data = sstruct.unpack2(nameRecordFormat, data, NameRecord())
            # Validate the bounds before slicing, so that a record is only
            # ever given a string of exactly 'length' bytes.
            if name.offset + name.length > len(stringData):
                log.error('skipping malformed name record #%d', i)
                continue
            name.string = stringData[name.offset:name.offset + name.length]
            # offset/length are only meaningful inside the binary table.
            del name.offset, name.length
            self.names.append(name)

    def compile(self, ttFont):
        """Return the binary representation of the table.

        ``self.names`` is sorted in place (see NameRecord.__lt__()), and
        identical encoded strings are stored only once.
        """
        if not hasattr(self, "names"):
            # only happens when there are NO name table entries read
            # from the TTX file
            self.names = []
        names = self.names
        names.sort()  # sort according to the spec; see NameRecord.__lt__()
        n = len(names)
        stringOffset = 6 + n * nameRecordSize
        # Collect the pieces in lists and join once at the end, instead of
        # repeatedly concatenating bytes objects.
        chunks = [struct.pack(b">HHH", 0, n, stringOffset)]
        strings = []
        stringDataLength = 0
        done = {}  # remember the data so we can reuse the "pointers"
        for name in names:
            string = name.toBytes()
            if string in done:
                name.offset, name.length = done[string]
            else:
                name.offset, name.length = done[string] = (
                    stringDataLength, len(string))
                strings.append(string)
                stringDataLength += len(string)
            chunks.append(sstruct.pack(nameRecordFormat, name))
        return bytesjoin(chunks + strings)

    def toXML(self, writer, ttFont):
        """Write every name record to 'writer'."""
        for name in self.names:
            name.toXML(writer, ttFont)

    def fromXML(self, name, attrs, content, ttFont):
        """Append a NameRecord built from a 'namerecord' element.

        Elements with any other tag name are ignored.
        """
        if name != "namerecord":
            return  # ignore unknown tags
        if not hasattr(self, "names"):
            self.names = []
        record = NameRecord()
        self.names.append(record)
        record.fromXML(name, attrs, content, ttFont)

    def getName(self, nameID, platformID, platEncID, langID=None):
        """Return the first record matching the given IDs, or None.

        If 'langID' is None, the language is not taken into account.
        """
        for namerecord in self.names:
            if (namerecord.nameID == nameID and
                    namerecord.platformID == platformID and
                    namerecord.platEncID == platEncID):
                if langID is None or namerecord.langID == langID:
                    return namerecord
        return None  # not found

    def getDebugName(self, nameID):
        """Return a decoded string for 'nameID', preferring English.

        Records that cannot be decoded are skipped. Returns None if no
        non-empty string is found.
        """
        englishName = someName = None
        for name in self.names:
            if name.nameID != nameID:
                continue
            try:
                unistr = name.toUnicode()
            except UnicodeDecodeError:
                continue

            someName = unistr
            if (name.platformID, name.langID) in ((1, 0), (3, 0x409)):
                englishName = unistr
                break
        return englishName or someName or None

    def setName(self, string, nameID, platformID, platEncID, langID):
        """ Set the 'string' for the name record identified by 'nameID', 'platformID',
        'platEncID' and 'langID'. If a record with that nameID doesn't exist, create it
        and append to the name table.

        'string' can be of type `str` (`unicode` in PY2) or `bytes`. In the latter case,
        it is assumed to be already encoded with the correct platform-specific encoding
        identified by the (platformID, platEncID, langID) triplet. A warning is issued
        to prevent unexpected results.

        Raises TypeError if 'string' is neither `str` nor `bytes`.
        """
        if not hasattr(self, 'names'):
            self.names = []
        if not isinstance(string, str):
            if isinstance(string, bytes):
                log.warning(
                    "name string is bytes, ensure it's correctly encoded: %r", string)
            else:
                raise TypeError(
                    "expected unicode or bytes, found %s: %r" % (
                        type(string).__name__, string))
        namerecord = self.getName(nameID, platformID, platEncID, langID)
        if namerecord is not None:
            namerecord.string = string
        else:
            self.names.append(makeName(string, nameID, platformID, platEncID, langID))

    def removeNames(self, nameID=None, platformID=None, platEncID=None, langID=None):
        """Remove any name records identified by the given combination of 'nameID',
        'platformID', 'platEncID' and 'langID'.

        Arguments left as None are not used for matching; if all of them are
        None, nothing is removed.
        """
        args = {
            argName: argValue
            for argName, argValue in (
                ("nameID", nameID),
                ("platformID", platformID),
                ("platEncID", platEncID),
                ("langID", langID),
            )
            if argValue is not None
        }
        if not args:
            # no arguments, nothing to do
            return
        # Keep a record as soon as it differs in at least one requested field.
        self.names = [
            rec for rec in self.names
            if any(
                argValue != getattr(rec, argName)
                for argName, argValue in args.items()
            )
        ]

    def _findUnusedNameID(self, minNameID=256):
        """Finds an unused name id.

        The nameID is assigned in the range between 'minNameID' and 32767 (inclusive),
        following the last nameID in the name table.

        Raises ValueError if the resulting nameID would exceed 32767.
        """
        names = getattr(self, 'names', [])
        nameID = 1 + max([n.nameID for n in names] + [minNameID - 1])
        if nameID > 32767:
            raise ValueError("nameID must be less than 32768")
        return nameID

    def findMultilingualName(self, names, windows=True, mac=True, minNameID=0):
        """Return the name ID of an existing multilingual name that
        matches the 'names' dictionary, or None if not found.

        'names' is a dictionary with the name in multiple languages,
        such as {'en': 'Pale', 'de': 'Blaß', 'de-CH': 'Blass'}.
        The keys can be arbitrary IETF BCP 47 language codes;
        the values are Unicode strings.

        If 'windows' is True, the returned name ID is guaranteed to
        exist for all requested languages for platformID=3 and
        platEncID=1.
        If 'mac' is True, the returned name ID is guaranteed to exist
        for all requested languages for platformID=1 and platEncID=0.

        The returned name ID will not be less than the 'minNameID'
        argument.
        """
        # Gather the set of requested
        #   (string, platformID, platEncID, langID)
        # tuples
        reqNameSet = set()
        for lang, name in sorted(names.items()):
            if windows:
                windowsName = _makeWindowsName(name, None, lang)
                if windowsName is not None:
                    reqNameSet.add((windowsName.string,
                                    windowsName.platformID,
                                    windowsName.platEncID,
                                    windowsName.langID))
            if mac:
                macName = _makeMacName(name, None, lang)
                if macName is not None:
                    reqNameSet.add((macName.string,
                                    macName.platformID,
                                    macName.platEncID,
                                    macName.langID))

        # Collect matching name IDs
        matchingNames = dict()
        for name in self.names:
            try:
                key = (name.toUnicode(), name.platformID,
                       name.platEncID, name.langID)
            except UnicodeDecodeError:
                continue
            if key in reqNameSet and name.nameID >= minNameID:
                nameSet = matchingNames.setdefault(name.nameID, set())
                nameSet.add(key)

        # Return the first name ID that defines all requested strings
        for nameID, nameSet in sorted(matchingNames.items()):
            if nameSet == reqNameSet:
                return nameID

        return None  # not found

    def addMultilingualName(self, names, ttFont=None, nameID=None,
                            windows=True, mac=True, minNameID=0):
        """Add a multilingual name, returning its name ID

        'names' is a dictionary with the name in multiple languages,
        such as {'en': 'Pale', 'de': 'Blaß', 'de-CH': 'Blass'}.
        The keys can be arbitrary IETF BCP 47 language codes;
        the values are Unicode strings.

        'ttFont' is the TTFont to which the names are added, or None.
        If present, the font's 'ltag' table can get populated
        to store exotic language codes, which allows encoding
        names that otherwise cannot get encoded at all.

        'nameID' is the name ID to be used, or None to let the library
        find an existing set of name records that match, or pick an
        unused name ID.

        If 'windows' is True, a platformID=3 name record will be added.
        If 'mac' is True, a platformID=1 name record will be added.

        If the 'nameID' argument is None, the created nameID will not
        be less than the 'minNameID' argument.
        """
        if not hasattr(self, 'names'):
            self.names = []
        if nameID is None:
            # Reuse nameID if possible
            nameID = self.findMultilingualName(
                names, windows=windows, mac=mac, minNameID=minNameID)
            if nameID is not None:
                return nameID
            # Honour 'minNameID' for newly allocated IDs as well, while never
            # going below 256, the lower bound that was used before.
            nameID = self._findUnusedNameID(max(minNameID, 256))
        # TODO: Should minimize BCP 47 language codes.
        # https://github.com/fonttools/fonttools/issues/930
        for lang, name in sorted(names.items()):
            if windows:
                windowsName = _makeWindowsName(name, nameID, lang)
                if windowsName is not None:
                    self.names.append(windowsName)
                else:
                    # We cannot make a Windows name: make sure we add a
                    # Mac name as a fallback. This can happen for exotic
                    # BCP47 language tags that have no Windows language code.
                    mac = True
            if mac:
                macName = _makeMacName(name, nameID, lang, ttFont)
                if macName is not None:
                    self.names.append(macName)
        return nameID

    def addName(self, string, platforms=((1, 0, 0), (3, 1, 0x409)), minNameID=255):
        """ Add a new name record containing 'string' for each (platformID, platEncID,
        langID) tuple specified in the 'platforms' list.

        The nameID is assigned in the range between 'minNameID'+1 and 32767 (inclusive),
        following the last nameID in the name table.
        If no 'platforms' are specified, two English name records are added, one for the
        Macintosh (platformID=1), and one for the Windows platform (3).

        The 'string' must be a Unicode string, so it can be encoded with different,
        platform-specific encodings.

        Return the new nameID.
        """
        assert len(platforms) > 0, \
            "'platforms' must contain at least one (platformID, platEncID, langID) tuple"
        if not hasattr(self, 'names'):
            self.names = []
        if not isinstance(string, str):
            raise TypeError(
                "expected str, found %s: %r" % (type(string).__name__, string))
        nameID = self._findUnusedNameID(minNameID + 1)
        for platformID, platEncID, langID in platforms:
            self.names.append(makeName(string, nameID, platformID, platEncID, langID))
        return nameID


def makeName(string, nameID, platformID, platEncID, langID):
    """Return a new NameRecord populated with the given string and IDs."""
    name = NameRecord()
    name.string, name.nameID, name.platformID, name.platEncID, name.langID = (
        string, nameID, platformID, platEncID, langID)
    return name


def _makeWindowsName(name, nameID, language):
    """Create a NameRecord for the Microsoft Windows platform

    'language' is an arbitrary IETF BCP 47 language identifier such
    as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. If Microsoft Windows
    does not support the desired language, the result will be None.
    Future versions of fonttools might return a NameRecord for the
    OpenType 'name' table format 1, but this is not implemented yet.
    """
    langID = _WINDOWS_LANGUAGE_CODES.get(language.lower())
    if langID is not None:
        return makeName(name, nameID, 3, 1, langID)
    log.warning("cannot add Windows name in language %s "
                "because fonttools does not yet support "
                "name table format 1", language)
    return None


def _makeMacName(name, nameID, language, font=None):
    """Create a NameRecord for Apple platforms

    'language' is an arbitrary IETF BCP 47 language identifier such
    as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. When possible, we
    create a Macintosh NameRecord that is understood by old applications
    (platform ID 1 and an old-style Macintosh language enum). If this
    is not possible, we create a Unicode NameRecord (platform ID 0)
    whose language points to the font’s 'ltag' table. The latter
    can encode any string in any language, but legacy applications
    might not recognize the format (in which case they will ignore
    those names).

    'font' should be the TTFont for which you want to create a name.
    If 'font' is None, we only return NameRecords for legacy Macintosh;
    in that case, the result will be None for names that need to
    be encoded with an 'ltag' table.

    See the section “The language identifier” in Apple’s specification:
    https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
    """
    macLang = _MAC_LANGUAGE_CODES.get(language.lower())
    macScript = _MAC_LANGUAGE_TO_SCRIPT.get(macLang)
    if macLang is not None and macScript is not None:
        encoding = getEncoding(1, macScript, macLang, default="ascii")
        # Check if we can actually encode this name. If we can't,
        # for example because we have no support for the legacy
        # encoding, or because the name string contains Unicode
        # characters that the legacy encoding cannot represent,
        # we fall back to encoding the name in Unicode and put
        # the language tag into the ltag table.
        try:
            tobytes(name, encoding, errors="strict")
        except UnicodeEncodeError:
            pass
        else:
            return makeName(name, nameID, 1, macScript, macLang)
    if font is not None:
        ltag = font.tables.get("ltag")
        if ltag is None:
            ltag = font["ltag"] = newTable("ltag")
        # 0 = Unicode; 4 = “Unicode 2.0 or later semantics (non-BMP characters allowed)”
        # “The preferred platform-specific code for Unicode would be 3 or 4.”
        # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
        return makeName(name, nameID, 0, 4, ltag.addTag(language))
    log.warning("cannot store language %s into 'ltag' table "
                "without having access to the TTFont object",
                language)
    return None


class NameRecord(object):
    """A single entry of the 'name' table.

    Instances carry the attributes 'string', 'nameID', 'platformID',
    'platEncID' and 'langID'; 'string' may be either bytes or a Unicode
    string.
    """

    def getEncoding(self, default='ascii'):
        """Returns the Python encoding name for this name entry based on its platformID,
        platEncID, and langID.  If encoding for these values is not known, by default
        'ascii' is returned.  That can be overridden by passing a value to the default
        argument.
        """
        return getEncoding(self.platformID, self.platEncID, self.langID, default)

    def encodingIsUnicodeCompatible(self):
        """Return True if this record's encoding is one of UTF-16BE, UCS-2BE,
        ASCII or Latin-1."""
        return self.getEncoding(None) in ['utf_16_be', 'ucs2be', 'ascii', 'latin1']

    def __str__(self):
        return self.toStr(errors='backslashreplace')

    def isUnicode(self):
        """Return True for platformID 0, or platformID 3 with platEncID 0, 1 or 10."""
        return (self.platformID == 0 or
                (self.platformID == 3 and self.platEncID in [0, 1, 10]))

    def toUnicode(self, errors='strict'):
        """
        If self.string is a Unicode string, return it; otherwise try decoding the
        bytes in self.string to a Unicode string using the encoding of this
        entry as returned by self.getEncoding(); Note that  self.getEncoding()
        returns 'ascii' if the encoding is unknown to the library.

        Certain heuristics are performed to recover data from bytes that are
        ill-formed in the chosen encoding, or that otherwise look misencoded
        (mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE
        but marked otherwise).  If the bytes are ill-formed and the heuristics fail,
        the error is handled according to the errors parameter to this function, which is
        passed to the underlying decode() function; by default it throws a
        UnicodeDecodeError exception.

        Note: The mentioned heuristics mean that roundtripping a font to XML and back
        to binary might recover some misencoded data whereas just loading the font
        and saving it back will not change them.
        """
        def isascii(b):
            return (b >= 0x20 and b <= 0x7E) or b in [0x09, 0x0A, 0x0D]
        encoding = self.getEncoding()
        string = self.string

        if isinstance(string, bytes) and encoding == 'utf_16_be' and len(string) % 2 == 1:
            # Recover badly encoded UTF-16 strings that have an odd number of bytes:
            # - If the last byte is zero, drop it.  Otherwise,
            # - If all the odd bytes are zero and all the even bytes are ASCII,
            #   prepend one zero byte.  Otherwise,
            # - If first byte is zero and all other bytes are ASCII, insert zero
            #   bytes between consecutive ASCII bytes.
            #
            # (Yes, I've seen all of these in the wild... sigh)
            if byteord(string[-1]) == 0:
                string = string[:-1]
            elif all(byteord(b) == 0 if i % 2 else isascii(byteord(b))
                     for i, b in enumerate(string)):
                string = b'\0' + string
            elif byteord(string[0]) == 0 and all(isascii(byteord(b)) for b in string[1:]):
                string = bytesjoin(b'\0' + bytechr(byteord(b)) for b in string[1:])

        string = tostr(string, encoding=encoding, errors=errors)

        # If decoded strings still looks like UTF-16BE, it suggests a double-encoding.
        # Fix it up.
        if all(ord(c) == 0 if i % 2 == 0 else isascii(ord(c))
               for i, c in enumerate(string)):
            # If string claims to be Mac encoding, but looks like UTF-16BE with ASCII text,
            # narrow it down.
            string = string[1::2]

        return string

    def toBytes(self, errors='strict'):
        """ If self.string is a bytes object, return it; otherwise try encoding
        the Unicode string in self.string to bytes using the encoding of this
        entry as returned by self.getEncoding(); Note that self.getEncoding()
        returns 'ascii' if the encoding is unknown to the library.

        If the Unicode string cannot be encoded to bytes in the chosen encoding,
        the error is handled according to the errors parameter to this function,
        which is passed to the underlying encode() function; by default it throws a
        UnicodeEncodeError exception.
        """
        return tobytes(self.string, encoding=self.getEncoding(), errors=errors)

    toStr = toUnicode

    def toXML(self, writer, ttFont):
        """Write this record as a 'namerecord' element.

        If the string cannot be decoded, the raw bytes are written instead
        and the element is marked with unicode=False.
        """
        try:
            unistr = self.toUnicode()
        except UnicodeDecodeError:
            unistr = None
        attrs = [
            ("nameID", self.nameID),
            ("platformID", self.platformID),
            ("platEncID", self.platEncID),
            ("langID", hex(self.langID)),
        ]

        if unistr is None or not self.encodingIsUnicodeCompatible():
            attrs.append(("unicode", unistr is not None))

        writer.begintag("namerecord", attrs)
        writer.newline()
        if unistr is not None:
            writer.write(unistr)
        else:
            writer.write8bit(self.string)
        writer.newline()
        writer.endtag("namerecord")
        writer.newline()

    def fromXML(self, name, attrs, content, ttFont):
        """Populate this record from the attributes and text content of a
        'namerecord' element; the string is stored as bytes."""
        self.nameID = safeEval(attrs["nameID"])
        self.platformID = safeEval(attrs["platformID"])
        self.platEncID = safeEval(attrs["platEncID"])
        self.langID = safeEval(attrs["langID"])
        s = strjoin(content).strip()
        encoding = self.getEncoding()
        if self.encodingIsUnicodeCompatible() or safeEval(attrs.get("unicode", "False")):
            self.string = s.encode(encoding)
        else:
            # This is the inverse of write8bit...
            self.string = s.encode("latin1")

    def __lt__(self, other):
        if type(self) != type(other):
            return NotImplemented

        try:
            # implemented so that list.sort() sorts according to the spec.
            selfTuple = (
                self.platformID,
                self.platEncID,
                self.langID,
                self.nameID,
                self.toBytes(),
            )
            otherTuple = (
                other.platformID,
                other.platEncID,
                other.langID,
                other.nameID,
                other.toBytes(),
            )
            return selfTuple < otherTuple
        except (UnicodeEncodeError, AttributeError):
            # This can only happen for
            # 1) an object that is not a NameRecord, or
            # 2) an unlikely incomplete NameRecord object which has not been
            #    fully populated, or
            # 3) when all IDs are identical but the strings can't be encoded
            #    for their platform encoding.
            # In all cases it is best to return NotImplemented.
            return NotImplemented

    def __repr__(self):
        return "<NameRecord NameID=%d; PlatformID=%d; LanguageID=%d>" % (
            self.nameID, self.platformID, self.langID)


# Windows language ID → IETF BCP-47 language tag
#
# While Microsoft indicates a region/country for all its language
# IDs, we follow Unicode practice by omitting “most likely subtags”
# as per Unicode CLDR. For example, English is simply “en” and not
# “en-Latn” because according to Unicode, the default script
# for English is Latin.
#
# http://www.unicode.org/cldr/charts/latest/supplemental/likely_subtags.html
# http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
_WINDOWS_LANGUAGES = {
    0x0436: 'af',
    0x041C: 'sq',
    0x0484: 'gsw',
    0x045E: 'am',
    0x1401: 'ar-DZ',
    0x3C01: 'ar-BH',
    0x0C01: 'ar',
    0x0801: 'ar-IQ',
    0x2C01: 'ar-JO',
    0x3401: 'ar-KW',
    0x3001: 'ar-LB',
    0x1001: 'ar-LY',
    0x1801: 'ary',
    0x2001: 'ar-OM',
    0x4001: 'ar-QA',
    0x0401: 'ar-SA',
    0x2801: 'ar-SY',
    0x1C01: 'aeb',
    0x3801: 'ar-AE',
    0x2401: 'ar-YE',
    0x042B: 'hy',
    0x044D: 'as',
    0x082C: 'az-Cyrl',
    0x042C: 'az',
    0x046D: 'ba',
    0x042D: 'eu',
    0x0423: 'be',
    0x0845: 'bn',
    0x0445: 'bn-IN',
    0x201A: 'bs-Cyrl',
    0x141A: 'bs',
    0x047E: 'br',
    0x0402: 'bg',
    0x0403: 'ca',
    0x0C04: 'zh-HK',
    0x1404: 'zh-MO',
    0x0804: 'zh',
    0x1004: 'zh-SG',
    0x0404: 'zh-TW',
    0x0483: 'co',
    0x041A: 'hr',
    0x101A: 'hr-BA',
    0x0405: 'cs',
    0x0406: 'da',
    0x048C: 'prs',
    0x0465: 'dv',
    0x0813: 'nl-BE',
    0x0413: 'nl',
    0x0C09: 'en-AU',
    0x2809: 'en-BZ',
    0x1009: 'en-CA',
    0x2409: 'en-029',
    0x4009: 'en-IN',
    0x1809: 'en-IE',
    0x2009: 'en-JM',
    0x4409: 'en-MY',
    0x1409: 'en-NZ',
    0x3409: 'en-PH',
    0x4809: 'en-SG',
    0x1C09: 'en-ZA',
    0x2C09: 'en-TT',
    0x0809: 'en-GB',
    0x0409: 'en',
    0x3009: 'en-ZW',
    0x0425: 'et',
    0x0438: 'fo',
    0x0464: 'fil',
    0x040B: 'fi',
    0x080C: 'fr-BE',
    0x0C0C: 'fr-CA',
    0x040C: 'fr',
    0x140C: 'fr-LU',
    0x180C: 'fr-MC',
    0x100C: 'fr-CH',
    0x0462: 'fy',
    0x0456: 'gl',
    0x0437: 'ka',
    0x0C07: 'de-AT',
    0x0407: 'de',
    0x1407: 'de-LI',
    0x1007: 'de-LU',
    0x0807: 'de-CH',
    0x0408: 'el',
    0x046F: 'kl',
    0x0447: 'gu',
    0x0468: 'ha',
    0x040D: 'he',
    0x0439: 'hi',
    0x040E: 'hu',
    0x040F: 'is',
    0x0470: 'ig',
    0x0421: 'id',
    0x045D: 'iu',
    0x085D: 'iu-Latn',
    0x083C: 'ga',
    0x0434: 'xh',
    0x0435: 'zu',
    0x0410: 'it',
    0x0810: 'it-CH',
    0x0411: 'ja',
    0x044B: 'kn',
    0x043F: 'kk',
    0x0453: 'km',
    0x0486: 'quc',
    0x0487: 'rw',
    0x0441: 'sw',
    0x0457: 'kok',
    0x0412: 'ko',
    0x0440: 'ky',
    0x0454: 'lo',
    0x0426: 'lv',
    0x0427: 'lt',
    0x082E: 'dsb',
    0x046E: 'lb',
    0x042F: 'mk',
    0x083E: 'ms-BN',
    0x043E: 'ms',
    0x044C: 'ml',
    0x043A: 'mt',
    0x0481: 'mi',
    0x047A: 'arn',
    0x044E: 'mr',
    0x047C: 'moh',
    0x0450: 'mn',
    0x0850: 'mn-CN',
    0x0461: 'ne',
    0x0414: 'nb',
    0x0814: 'nn',
    0x0482: 'oc',
    0x0448: 'or',
    0x0463: 'ps',
    0x0415: 'pl',
    0x0416: 'pt',
    0x0816: 'pt-PT',
    0x0446: 'pa',
    0x046B: 'qu-BO',
    0x086B: 'qu-EC',
    0x0C6B: 'qu',
    0x0418: 'ro',
    0x0417: 'rm',
    0x0419: 'ru',
    0x243B: 'smn',
    0x103B: 'smj-NO',
    0x143B: 'smj',
    0x0C3B: 'se-FI',
    0x043B: 'se',
    0x083B: 'se-SE',
    0x203B: 'sms',
    0x183B: 'sma-NO',
    # 0x1C3B is Southern Sami (Sweden), the counterpart of 0x183B 'sma-NO';
    # Skolt Sami ('sms') is 0x203B only.
    0x1C3B: 'sma',
    0x044F: 'sa',
    0x1C1A: 'sr-Cyrl-BA',
    0x0C1A: 'sr',
    0x181A: 'sr-Latn-BA',
    0x081A: 'sr-Latn',
    0x046C: 'nso',
    0x0432: 'tn',
    0x045B: 'si',
    0x041B: 'sk',
    0x0424: 'sl',
    0x2C0A: 'es-AR',
    0x400A: 'es-BO',
    0x340A: 'es-CL',
    0x240A: 'es-CO',
    0x140A: 'es-CR',
    0x1C0A: 'es-DO',
    0x300A: 'es-EC',
    0x440A: 'es-SV',
    0x100A: 'es-GT',
    0x480A: 'es-HN',
    0x080A: 'es-MX',
    0x4C0A: 'es-NI',
    0x180A: 'es-PA',
    0x3C0A: 'es-PY',
    0x280A: 'es-PE',
    0x500A: 'es-PR',

    # Microsoft has defined two different language codes for
    # “Spanish with modern sorting” and “Spanish with traditional
    # sorting”. This makes sense for collation APIs, and it would be
    # possible to express this in BCP 47 language tags via Unicode
    # extensions (eg., “es-u-co-trad” is “Spanish with traditional
    # sorting”). However, for storing names in fonts, this distinction
    # does not make sense, so we use “es” in both cases.
    0x0C0A: 'es',
    0x040A: 'es',

    0x540A: 'es-US',
    0x380A: 'es-UY',
    0x200A: 'es-VE',
    0x081D: 'sv-FI',
    0x041D: 'sv',
    0x045A: 'syr',
    0x0428: 'tg',
    0x085F: 'tzm',
    0x0449: 'ta',
    0x0444: 'tt',
    0x044A: 'te',
    0x041E: 'th',
    0x0451: 'bo',
    0x041F: 'tr',
    0x0442: 'tk',
    0x0480: 'ug',
    0x0422: 'uk',
    0x042E: 'hsb',
    0x0420: 'ur',
    0x0843: 'uz-Cyrl',
    0x0443: 'uz',
    0x042A: 'vi',
    0x0452: 'cy',
    0x0488: 'wo',
    0x0485: 'sah',
    0x0478: 'ii',
    0x046A: 'yo',
}


# MacOS language ID → IETF BCP-47 language tag
_MAC_LANGUAGES = {
    0: 'en',
    1: 'fr',
    2: 'de',
    3: 'it',
    4: 'nl',
    5: 'sv',
    6: 'es',
    7: 'da',
    8: 'pt',
    9: 'no',
    10: 'he',
    11: 'ja',
    12: 'ar',
    13: 'fi',
    14: 'el',
    15: 'is',
    16: 'mt',
    17: 'tr',
    18: 'hr',
    19: 'zh-Hant',
    20: 'ur',
    21: 'hi',
    22: 'th',
    23: 'ko',
    24: 'lt',
    25: 'pl',
    26: 'hu',
    27: 'et',  # langEstonian (was 'es', which hijacked Spanish in the reverse map)
    28: 'lv',
    29: 'se',
    30: 'fo',
    31: 'fa',
    32: 'ru',
    33: 'zh',
    34: 'nl-BE',
    35: 'ga',
    36: 'sq',
    37: 'ro',
    38: 'cs',  # langCzech ('cz' is not a language code)
    39: 'sk',
    40: 'sl',
    41: 'yi',
    42: 'sr',
    43: 'mk',
    44: 'bg',
    45: 'uk',
    46: 'be',
    47: 'uz',
    48: 'kk',
    49: 'az-Cyrl',
    50: 'az-Arab',
    51: 'hy',
    52: 'ka',
    53: 'mo',
    54: 'ky',
    55: 'tg',
    56: 'tk',
    57: 'mn-CN',
    58: 'mn',
    59: 'ps',
    60: 'ku',  # langKurdish
    61: 'ks',  # langKashmiri
    62: 'sd',
    63: 'bo',
    64: 'ne',
    65: 'sa',
    66: 'mr',
    67: 'bn',
    68: 'as',
    69: 'gu',
    70: 'pa',
    71: 'or',
    72: 'ml',
    73: 'kn',
    74: 'ta',
    75: 'te',
    76: 'si',
    77: 'my',
    78: 'km',
    79: 'lo',
    80: 'vi',
    81: 'id',
    82: 'tl',
    83: 'ms',
    84: 'ms-Arab',
    85: 'am',
    86: 'ti',
    87: 'om',
    88: 'so',
    89: 'sw',
    90: 'rw',
    91: 'rn',
    92: 'ny',
    93: 'mg',
    94: 'eo',
    128: 'cy',
    129: 'eu',
    130: 'ca',
    131: 'la',
    132: 'qu',
    133: 'gn',
    134: 'ay',
    135: 'tt',
    136: 'ug',
    137: 'dz',
    138: 'jv',
    139: 'su',
    140: 'gl',
    141: 'af',
    142: 'br',
    143: 'iu',
    144: 'gd',
    145: 'gv',
    146: 'ga',
    147: 'to',
    148: 'el-polyton',
    149: 'kl',
    150: 'az',
    151: 'nn',
}


# Reverse maps: lowercased language tag → platform language ID. When several
# IDs share one tag, the entry listed last in the table above wins.
_WINDOWS_LANGUAGE_CODES = {lang.lower(): code for code, lang in _WINDOWS_LANGUAGES.items()}
_MAC_LANGUAGE_CODES = {lang.lower(): code for code, lang in _MAC_LANGUAGES.items()}


# MacOS language ID → MacOS script ID
#
# Note that the script ID is not sufficient to determine what encoding
# to use in TrueType files. For some languages, MacOS used a modification
# of a mainstream script. For example, an Icelandic name would be stored
# with smRoman in the TrueType naming table, but the actual encoding
# is a special Icelandic version of the normal Macintosh Roman encoding.
# As another example, Inuktitut uses an 8-bit encoding for Canadian Aboriginal
# Syllables but MacOS had run out of available script codes, so this was
# done as a (pretty radical) “modification” of Ethiopic.
#
# http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/Readme.txt
_MAC_LANGUAGE_TO_SCRIPT = {
    0: 0,  # langEnglish → smRoman
    1: 0,  # langFrench → smRoman
    2: 0,  # langGerman → smRoman
    3: 0,  # langItalian → smRoman
    4: 0,  # langDutch → smRoman
    5: 0,  # langSwedish → smRoman
    6: 0,  # langSpanish → smRoman
    7: 0,  # langDanish → smRoman
    8: 0,  # langPortuguese → smRoman
    9: 0,  # langNorwegian → smRoman
    10: 5,  # langHebrew → smHebrew
    11: 1,  # langJapanese → smJapanese
    12: 4,  # langArabic → smArabic
    13: 0,  # langFinnish → smRoman
    14: 6,  # langGreek → smGreek
    15: 0,  # langIcelandic → smRoman (modified)
    16: 0,  # langMaltese → smRoman
    17: 0,  # langTurkish → smRoman (modified)
    18: 0,  # langCroatian → smRoman (modified)
    19: 2,  # langTradChinese → smTradChinese
    20: 4,  # langUrdu → smArabic
    21: 9,  # langHindi → smDevanagari
    22: 21,  # langThai → smThai
    23: 3,  # langKorean → smKorean
    24: 29,  # langLithuanian → smCentralEuroRoman
    25: 29,  # langPolish → smCentralEuroRoman
    26: 29,  # langHungarian → smCentralEuroRoman
    27: 29,  # langEstonian → smCentralEuroRoman
    28: 29,  # langLatvian → smCentralEuroRoman
    29: 0,  # langSami → smRoman
    30: 0,  # langFaroese → smRoman (modified)
    31: 4,  # langFarsi → smArabic (modified)
    32: 7,  # langRussian → smCyrillic
    33: 25,  # langSimpChinese → smSimpChinese
    34: 0,  # langFlemish → smRoman
    35: 0,  # langIrishGaelic → smRoman (modified)
    36: 0,  # langAlbanian → smRoman
    37: 0,  # langRomanian → smRoman (modified)
    38: 29,  # langCzech → smCentralEuroRoman
    39: 29,  # langSlovak → smCentralEuroRoman
    40: 0,  # langSlovenian → smRoman (modified)
    41: 5,  # langYiddish → smHebrew
    42: 7,  # langSerbian → smCyrillic
    43: 7,  # langMacedonian → smCyrillic
    44: 7,  # langBulgarian → smCyrillic
    45: 7,  # langUkrainian → smCyrillic (modified)
    46: 7,  # langByelorussian → smCyrillic
    47: 7,  # langUzbek → smCyrillic
    48: 7,  # langKazakh → smCyrillic
    49: 7,  # langAzerbaijani → smCyrillic
    50: 4,  # langAzerbaijanAr → smArabic
    51: 24,  # langArmenian → smArmenian
    52: 23,  # langGeorgian → smGeorgian
    53: 7,  # langMoldavian → smCyrillic
    54: 7,  # langKirghiz → smCyrillic
    55: 7,  # langTajiki → smCyrillic
    56: 7,  # langTurkmen → smCyrillic
    57: 27,  # langMongolian → smMongolian
    58: 7,  # langMongolianCyr → smCyrillic
    59: 4,  # langPashto → smArabic
    60: 4,  # langKurdish → smArabic
    61: 4,  # langKashmiri → smArabic
    62: 4,  # langSindhi → smArabic
    63: 26,  # langTibetan → smTibetan
    64: 9,  # langNepali → smDevanagari
    65: 9,  # langSanskrit → smDevanagari
    66: 9,  # langMarathi → smDevanagari
    67: 13,  # langBengali → smBengali
    68: 13,  # langAssamese → smBengali
    69: 11,  # langGujarati → smGujarati
    70: 10,  # langPunjabi → smGurmukhi
    71: 12,  # langOriya → smOriya
    72: 17,  # langMalayalam → smMalayalam
    73: 16,  # langKannada → smKannada
    74: 14,  # langTamil → smTamil
    75: 15,  # langTelugu → smTelugu
    76: 18,  # langSinhalese → smSinhalese
    77: 19,  # langBurmese → smBurmese
    78: 20,  # langKhmer → smKhmer
    79: 22,  # langLao → smLao
    80: 30,  # langVietnamese → smVietnamese
    81: 0,  # langIndonesian → smRoman
    82: 0,  # langTagalog → smRoman
    83: 0,  # langMalayRoman → smRoman
    84: 4,  # langMalayArabic → smArabic
    85: 28,  # langAmharic → smEthiopic
    86: 28,  # langTigrinya → smEthiopic
    87: 28,  # langOromo → smEthiopic
    88: 0,  # langSomali → smRoman
    89: 0,  # langSwahili → smRoman
    90: 0,  # langKinyarwanda → smRoman
    91: 0,  # langRundi → smRoman
    92: 0,  # langNyanja → smRoman
    93: 0,  # langMalagasy → smRoman
    94: 0,  # langEsperanto → smRoman
    128: 0,  # langWelsh → smRoman (modified)
    129: 0,  # langBasque → smRoman
    130: 0,  # langCatalan → smRoman
    131: 0,  # langLatin → smRoman
    132: 0,  # langQuechua → smRoman
    133: 0,  # langGuarani → smRoman
    134: 0,  # langAymara → smRoman
    135: 7,  # langTatar → smCyrillic
    136: 4,  # langUighur → smArabic
    137: 26,  # langDzongkha → smTibetan
    138: 0,  # langJavaneseRom → smRoman
    139: 0,  # langSundaneseRom → smRoman
    140: 0,  # langGalician → smRoman
    141: 0,  # langAfrikaans → smRoman
    142: 0,  # langBreton → smRoman (modified)
    143: 28,  # langInuktitut → smEthiopic (modified)
    144: 0,  # langScottishGaelic → smRoman (modified)
    145: 0,  # langManxGaelic → smRoman (modified)
    146: 0,  # langIrishGaelicScript → smRoman (modified)
    147: 0,  # langTongan → smRoman
    148: 6,  # langGreekAncient → smGreek
    149: 0,  # langGreenlandic → smRoman
    150: 0,  # langAzerbaijanRoman → smRoman
    151: 0,  # langNynorsk → smRoman
}