"""Internationalization and localization support.

This module provides internationalization (I18N) and localization (L10N)
support for your Python programs by providing an interface to the GNU gettext
message catalog library.

I18N refers to the operation by which a program is made aware of multiple
languages.  L10N refers to the adaptation of your program, once
internationalized, to the local language and cultural habits.

"""

# This module represents the integration of work, contributions, feedback, and
# suggestions from the following people:
#
# Martin von Loewis, who wrote the initial implementation of the underlying
# C-based libintlmodule (later renamed _gettext), along with a skeletal
# gettext.py implementation.
#
# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
# which also included a pure-Python implementation to read .mo files if
# intlmodule wasn't available.
#
# James Henstridge, who also wrote a gettext.py module, which has some
# interesting, but currently unsupported experimental features: the notion of
# a Catalog class and instances, and the ability to add to a catalog file via
# a Python API.
#
# Barry Warsaw integrated these modules, wrote the .install() API and code,
# and conformed all C and Python code to Python's coding standards.
#
# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
# module.
#
# J. David Ibanez implemented plural forms.  Bruno Haible fixed some bugs.
#
# TODO:
# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
#   memory, but that's probably bad for large translated programs.  Instead,
#   the lexical sort of original strings in GNU .mo files should be exploited
#   to do binary searches and lazy initializations.  Or you might want to use
#   the undocumented double-hash algorithm for .mo files with hash tables, but
#   you'll need to study the GNU gettext code to do this.
#
# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
#   find this format documented anywhere.


import operator
import os
import re
import sys


__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'bindtextdomain', 'find', 'translation', 'install',
           'textdomain', 'dgettext', 'dngettext', 'gettext',
           'ngettext', 'pgettext', 'dpgettext', 'npgettext',
           'dnpgettext'
           ]

_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')

# Expression parsing for plural form selection.
#
# The gettext library supports a small subset of C syntax.  The only
# incompatible difference is that integer literals starting with zero are
# decimal.
#
# https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
# http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y

_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)


def _tokenize(plural):
    """Yield the tokens of the plural expression string *plural*.

    Whitespace tokens are skipped.  After the last token an empty string is
    yielded as an end-of-input marker for the parser.

    Raises ValueError for any text matched by the INVALID alternative of the
    token pattern.
    """
    for mo in _token_pattern.finditer(plural):
        kind = mo.lastgroup
        if kind == 'WHITESPACES':
            continue
        value = mo.group(kind)
        if kind == 'INVALID':
            raise ValueError('invalid token in plural form: %s' % value)
        yield value
    yield ''


def _error(value):
    """Return (not raise) a ValueError describing the unexpected token *value*.

    An empty *value* is the tokenizer's end marker and is reported as an
    unexpected end of the expression.
    """
    if value:
        return ValueError('unexpected token in plural form: %s' % value)
    else:
        return ValueError('unexpected end of plural form')


# Binary operators grouped by precedence, lowest first.  The tuple is then
# replaced by a mapping from operator to its 1-based precedence level.
_binary_ops = (
    ('||',),
    ('&&',),
    ('==', '!='),
    ('<', '>', '<=', '>='),
    ('+', '-'),
    ('*', '/', '%'),
)
_binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
# C operators whose Python spelling differs.
_c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}


def _parse(tokens, priority=-1):
    """Translate one (sub)expression from *tokens* into Python source text.

    *tokens* is the iterator produced by _tokenize().  *priority* is the
    lowest binary-operator level this call may consume; operators below it
    are left for the caller, and the conditional operator is only handled
    when *priority* <= 0.

    Returns a tuple (python_source, next_token) where next_token is the
    first token that was read but not consumed.

    Raises ValueError on an unexpected token, unexpected end of input, or an
    unbalanced parenthesis.
    """
    result = ''
    nexttok = next(tokens)
    # Any number of leading logical negations.
    while nexttok == '!':
        result += 'not '
        nexttok = next(tokens)

    # Primary expression: parenthesized group, the variable n, or a number.
    if nexttok == '(':
        sub, nexttok = _parse(tokens)
        result = '%s(%s)' % (result, sub)
        if nexttok != ')':
            raise ValueError('unbalanced parenthesis in plural form')
    elif nexttok == 'n':
        result = '%s%s' % (result, nexttok)
    else:
        try:
            # Base 10 even with leading zeros (see the note above).
            value = int(nexttok, 10)
        except ValueError:
            raise _error(nexttok) from None
        result = '%s%d' % (result, value)
    nexttok = next(tokens)

    # j is the level of the previously consumed binary operator; 100 means
    # "none yet".
    j = 100
    while nexttok in _binary_ops:
        i = _binary_ops[nexttok]
        if i < priority:
            break
        # Break chained comparisons
        if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
            result = '(%s)' % result
        # Replace some C operators by their Python equivalents
        op = _c2py_ops.get(nexttok, nexttok)
        right, nexttok = _parse(tokens, i + 1)
        result = '%s %s %s' % (result, op, right)
        j = i
    if j == priority == 4:  # '<', '>', '<=', '>='
        result = '(%s)' % result

    if nexttok == '?' and priority <= 0:
        if_true, nexttok = _parse(tokens, 0)
        if nexttok != ':':
            raise _error(nexttok)
        if_false, nexttok = _parse(tokens)
        # C's "cond ? a : b" becomes Python's "a if cond else b".
        result = '%s if %s else %s' % (if_true, result, if_false)
        if priority == 0:
            result = '(%s)' % result

    return result, nexttok


def _as_int(n):
    """Convert the plural value *n* for use by a compiled plural function.

    Raises TypeError if round(n) raises TypeError; otherwise defers to
    _as_int2().
    """
    try:
        round(n)
    except TypeError:
        raise TypeError('Plural value must be an integer, got %s' %
                        (n.__class__.__name__,)) from None
    return _as_int2(n)


def _as_int2(n):
    """Return operator.index(n), or *n* itself with a DeprecationWarning.

    When *n* does not support operator.index(), a DeprecationWarning is
    issued and *n* is returned unchanged.
    """
    try:
        return operator.index(n)
    except TypeError:
        pass

    import warnings
    # Skip every frame whose globals belong to this module so that the
    # warning is attributed to the first caller outside of it.
    frame = sys._getframe(1)
    stacklevel = 2
    while frame.f_back is not None and frame.f_globals.get('__name__') == __name__:
        stacklevel += 1
        frame = frame.f_back
    warnings.warn('Plural value must be an integer, got %s' %
                  (n.__class__.__name__,),
                  DeprecationWarning,
                  stacklevel)
    return n


def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python function that implements an equivalent expression.

    Raises ValueError if the expression is longer than 1000 characters,
    cannot be parsed, or is nested too deeply.
    """

    if len(plural) > 1000:
        raise ValueError('plural form expression is too long')
    try:
        result, nexttok = _parse(_tokenize(plural))
        if nexttok:
            raise _error(nexttok)

        depth = 0
        for c in result:
            if c == '(':
                depth += 1
                if depth > 20:
                    # Python compiler limit is about 90.
                    # The most complex example has 2.
                    raise ValueError('plural form expression is too complex')
            elif c == ')':
                depth -= 1

        # NOTE: exec() is applied to text assembled by _parse() from the
        # tokens accepted by _tokenize(), not to the raw input string.
        # '__name__' is set so that frames of the generated function are
        # treated as belonging to this module by _as_int2().
        ns = {'_as_int': _as_int, '__name__': __name__}
        exec('''if True:
            def func(n):
                if not isinstance(n, int):
                    n = _as_int(n)
                return int(%s)
            ''' % result, ns)
        return ns['func']
    except RecursionError:
        # Recursion error can be raised in _parse() or exec().
        raise ValueError('plural form expression is too complex')


def _expand_lang(loc):
    """Return the list of locale-name variants derived from *loc*.

    *loc* is first passed through locale.normalize() and then split into
    language, territory ('_..'), codeset ('.…') and modifier ('@…').  The
    result contains every combination of the components that are present,
    starting with the variant that has all of them and ending with the bare
    language.
    """
    import locale
    loc = locale.normalize(loc)
    COMPONENT_CODESET   = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER  = 1 << 2
    # split up the locale into its base components
    mask = 0
    pos = loc.find('@')
    if pos >= 0:
        modifier = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_MODIFIER
    else:
        modifier = ''
    pos = loc.find('.')
    if pos >= 0:
        codeset = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_CODESET
    else:
        codeset = ''
    pos = loc.find('_')
    if pos >= 0:
        territory = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_TERRITORY
    else:
        territory = ''
    language = loc
    ret = []
    for i in range(mask+1):
        if not (i & ~mask):  # if all components for this combo exist ...
            val = language
            if i & COMPONENT_TERRITORY: val += territory
            if i & COMPONENT_CODESET:   val += codeset
            if i & COMPONENT_MODIFIER:  val += modifier
            ret.append(val)
    ret.reverse()
    return ret


class NullTranslations:
    """Translations object that returns messages untranslated.

    Lookups are delegated to the fallback object when one has been added
    with add_fallback(); otherwise the original message is returned.
    Subclasses override _parse() to read a catalog from a file object.
    """

    def __init__(self, fp=None):
        """Initialize the instance and, if *fp* is given, call _parse(fp)."""
        self._info = {}
        self._charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Read catalog data from *fp*.  Does nothing in this class."""
        pass

    def add_fallback(self, fallback):
        """Append *fallback* to the end of this object's fallback chain."""
        if self._fallback:
            self._fallback.add_fallback(fallback)
        else:
            self._fallback = fallback

    def gettext(self, message):
        """Return the fallback's translation of *message*, else *message*."""
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural lookup, else *msgid1* if *n* is 1
        and *msgid2* otherwise.
        """
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        n = _as_int2(n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def pgettext(self, context, message):
        """Like gettext(), delegating with *context* to the fallback."""
        if self._fallback:
            return self._fallback.pgettext(context, message)
        return message

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), delegating with *context* to the fallback."""
        if self._fallback:
            return self._fallback.npgettext(context, msgid1, msgid2, n)
        n = _as_int2(n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def info(self):
        """Return the metadata dictionary of the catalog."""
        return self._info

    def charset(self):
        """Return the catalog's character set, or None if it is unset."""
        return self._charset

    def install(self, names=None):
        """Bind self.gettext to the name '_' in the builtins namespace.

        If *names* is given, each of 'gettext', 'ngettext', 'npgettext' and
        'pgettext' that it contains is installed in builtins as well.
        """
        import builtins
        builtins.__dict__['_'] = self.gettext
        if names is not None:
            allowed = {'gettext', 'ngettext', 'npgettext', 'pgettext'}
            for name in allowed & set(names):
                builtins.__dict__[name] = getattr(self, name)


class GNUTranslations(NullTranslations):
    """Translations object backed by a catalog read from a GNU .mo file."""

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    # The encoding of a msgctxt and a msgid in a .mo file is
    # msgctxt + "\x04" + msgid (gettext version >= 0.15)
    CONTEXT = "%s\x04%s"

    # Acceptable .mo versions
    VERSIONS = (0, 1)

    def _get_versions(self, version):
        """Returns a tuple of major version, minor version"""
        return (version >> 16, version & 0xffff)

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp* and fills self._catalog.  The entry with an empty
        msgid is treated as catalog metadata: its "key: value" lines are
        stored in self._info, 'content-type' sets the charset and
        'plural-forms' sets self.plural.

        Raises OSError for a bad magic number, an unsupported major version
        or an entry that extends past the end of the data.
        """
        # Delay struct import for speeding up gettext import when .mo files
        # are not used.
        from struct import unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 little endian 32
        # bit words.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1)  # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise OSError(0, 'Bad magic number', filename)

        major_version, minor_version = self._get_versions(version)

        if major_version not in self.VERSIONS:
            raise OSError(0, 'Bad version number ' + str(major_version), filename)

        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(msgcount):
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise OSError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split(b'\n'):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    # Skip over comment lines:
                    if item.startswith('#-#-#-#-#') and item.endswith('#-#-#-#-#'):
                        continue
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # Continuation of the previous header's value.
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        # A Content-Type value without a 'charset='
                        # parameter leaves the charset unset (so 'ascii' is
                        # used below) instead of raising IndexError.
                        parts = v.split('charset=')
                        if len(parts) > 1:
                            self._charset = parts[1]
                    elif k == 'plural-forms':
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms
                msgid1, msgid2 = msg.split(b'\x00')
                tmsg = tmsg.split(b'\x00')
                msgid1 = str(msgid1, charset)
                # Each plural variant is stored under (msgid1, index).
                for i, x in enumerate(tmsg):
                    catalog[(msgid1, i)] = str(x, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of *message*.

        The catalog is searched for *message* itself and then for its
        plural entry selected by self.plural(1).  If neither exists the
        fallback is consulted, and finally *message* is returned unchanged.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            tmsg = self._catalog.get((message, self.plural(1)), missing)
        if tmsg is not missing:
            return tmsg
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural translation of *msgid1* selected by *n*.

        If the catalog has no matching entry the fallback is consulted;
        without a fallback, *msgid1* is returned when *n* equals 1 and
        *msgid2* otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg

    def pgettext(self, context, message):
        """Like gettext(), looking *message* up under *context*.

        The catalog key is built with the CONTEXT template.
        """
        ctxt_msg_id = self.CONTEXT % (context, message)
        missing = object()
        tmsg = self._catalog.get(ctxt_msg_id, missing)
        if tmsg is missing:
            tmsg = self._catalog.get((ctxt_msg_id, self.plural(1)), missing)
        if tmsg is not missing:
            return tmsg
        if self._fallback:
            return self._fallback.pgettext(context, message)
        return message

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), looking *msgid1* up under *context*."""
        ctxt_msg_id = self.CONTEXT % (context, msgid1)
        try:
            tmsg = self._catalog[ctxt_msg_id, self.plural(n)]
        except KeyError:
            if self._fallback:
                return self._fallback.npgettext(context, msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg


# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Locate the .mo file(s) for *domain*.

    *localedir* defaults to the module's default locale directory.  When
    *languages* is None, it is taken from the first non-empty environment
    variable among LANGUAGE, LC_ALL, LC_MESSAGES and LANG (split on ':'),
    with 'C' appended if absent.  Each language is expanded with
    _expand_lang() and the path
    localedir/<lang>/LC_MESSAGES/<domain>.mo is tested for existence; the
    search stops at the language 'C'.

    Returns the first existing path, or None if there is none.  With *all*
    true, returns the list of all existing paths instead (possibly empty).
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            val = os.environ.get(envar)
            if val:
                languages = val.split(':')
                break
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages
    nelangs = []
    for lang in languages:
        for nelang in _expand_lang(lang):
            if nelang not in nelangs:
                nelangs.append(nelang)
    # select a language
    if all:
        result = []
    else:
        result = None
    for lang in nelangs:
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if os.path.exists(mofile):
            if all:
                result.append(mofile)
            else:
                return mofile
    return result


# a mapping between absolute .mo file path and Translation object
_translations = {}


def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False):
    """Return a translations object for *domain*.

    All .mo files reported by find() are loaded with *class_* (default
    GNUTranslations); the first becomes the result and the others are
    chained to it as fallbacks.  Parsed catalogs are cached per
    (class, absolute path), and a shallow copy of the cached object is used
    for each call.

    If no .mo file is found, a NullTranslations instance is returned when
    *fallback* is true; otherwise FileNotFoundError is raised.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=True)
    if not mofiles:
        if fallback:
            return NullTranslations()
        from errno import ENOENT
        raise FileNotFoundError(ENOENT,
                                'No translation file found for domain', domain)
    # Delay copy import for speeding up gettext import when .mo files
    # are not used.
    import copy
    # Avoid opening, reading, and parsing the .mo file after it's been done
    # once.
    result = None
    for mofile in mofiles:
        key = (class_, os.path.abspath(mofile))
        t = _translations.get(key)
        if t is None:
            with open(mofile, 'rb') as fp:
                t = _translations.setdefault(key, class_(fp))
        # Copy the translation object to allow setting fallbacks and
        # output charset.  All other instance data is shared with the
        # cached object.
        t = copy.copy(t)
        if result is None:
            result = t
        else:
            result.add_fallback(t)
    return result


def install(domain, localedir=None, *, names=None):
    """Install the translations for *domain* into the builtins namespace.

    Uses translation() with fallback enabled and calls the resulting
    object's install() method with *names*.
    """
    t = translation(domain, localedir, fallback=True)
    t.install(names)


# a mapping b/w domains and locale directories
_localedirs = {}
# current global domain, `messages' used for compatibility w/ GNU gettext
_current_domain = 'messages'


def textdomain(domain=None):
    """Set the current global domain if *domain* is given; return it."""
    global _current_domain
    if domain is not None:
        _current_domain = domain
    return _current_domain


def bindtextdomain(domain, localedir=None):
    """Bind *domain* to *localedir* if it is given.

    Returns the directory bound to *domain*, or the default locale
    directory when the domain has no binding.
    """
    global _localedirs
    if localedir is not None:
        _localedirs[domain] = localedir
    return _localedirs.get(domain, _default_localedir)


def dgettext(domain, message):
    """Return the translation of *message* from *domain*.

    *message* is returned unchanged if loading the domain raises OSError.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return message
    return t.gettext(message)


def dngettext(domain, msgid1, msgid2, n):
    """Return the plural translation selected by *n* from *domain*.

    If loading the domain raises OSError, *msgid1* is returned when *n* is 1
    and *msgid2* otherwise.
    """
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        n = _as_int2(n)
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.ngettext(msgid1, msgid2, n)


def dpgettext(domain, context, message):
    """Like dgettext(), looking *message* up under *context*."""
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return message
    return t.pgettext(context, message)


def dnpgettext(domain, context, msgid1, msgid2, n):
    """Like dngettext(), looking *msgid1* up under *context*."""
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        n = _as_int2(n)
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.npgettext(context, msgid1, msgid2, n)


def gettext(message):
    """Return the translation of *message* from the current global domain."""
    return dgettext(_current_domain, message)


def ngettext(msgid1, msgid2, n):
    """Plural lookup in the current global domain; see dngettext()."""
    return dngettext(_current_domain, msgid1, msgid2, n)


def pgettext(context, message):
    """Context lookup in the current global domain; see dpgettext()."""
    return dpgettext(_current_domain, context, message)


def npgettext(context, msgid1, msgid2, n):
    """Context plural lookup in the current global domain; see dnpgettext()."""
    return dnpgettext(_current_domain, context, msgid1, msgid2, n)


# dcgettext() has been deemed unnecessary and is not implemented.

# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

Catalog = translation