#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Juergen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] https://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] https://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import ast
import token
import tokenize

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the module docstring (the usage text) and exit with `code`."""
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)



def make_escapes(pass_nonascii):
    """Install the module-global `escape` function and `escapes` table.

    When `pass_nonascii` is true, only ASCII control characters are escaped
    and bytes >= 128 pass through unchanged; otherwise every byte outside
    the printable 32..126 range is rendered as an octal escape.
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "H\xf6he"' would not result in 'msgid "H\366he"'. Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
        escape = escape_ascii
    else:
        mod = 256
        escape = escape_nonascii
    escapes = [r"\%03o" % i for i in range(mod)]
    for i in range(32, 127):
        escapes[i] = chr(i)
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    # Escape only code points below 128; everything else passes through.
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)

def escape_nonascii(s, encoding):
    # Escape byte-wise after encoding, so non-ASCII becomes octal escapes.
    return ''.join(escapes[b] for b in s.encode(encoding))


def is_literal_string(s):
    # A token is a plain/raw/unicode string literal if it starts with a
    # quote, or with a single r/R/u/U prefix followed by a quote.
    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')


def safe_eval(s):
    # unwrap quotes, safely (no builtins available to the evaluated code)
    return eval(s, {'__builtins__': {}}, {})


def normalize(s, encoding):
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s, encoding) + '"'
    else:
        if not lines[-1]:
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
            lines[i] = escape(lines[i], encoding)
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    # (parameter names kept for backward compatibility with keyword callers)
    return any(c in str for c in set)


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            matches = []
            for pattern_match in glob.glob(name):
                matches.extend(getFilesForName(pattern_match))
            return matches

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except ImportError:
            spec = None
        # find_spec() returns None for an unresolvable top-level name, and
        # spec.origin is None for e.g. namespace packages; previously a
        # None spec crashed with AttributeError on spec.origin.
        if spec is None or not spec.origin:
            return []
        name = spec.origin

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        # get extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            found.extend(
                [os.path.join(root, fname) for fname in files
                 if os.path.splitext(fname)[1] == _py_ext]
            )
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    """State machine fed one token at a time by tokenize; collects
    translatable strings (and optionally docstrings) into __messages.
    """

    def __init__(self, options):
        self.__options = options
        # msg -> {(filename, lineno): isdocstring}
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch
##        import token
##        print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
##              file=sys.stderr)
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen
            return
        if ttype == tokenize.STRING:
            # Look inside f-strings for gettext calls in formatted values.
            maybe_fstring = ast.parse(tstring, mode='eval').body
            if not isinstance(maybe_fstring, ast.JoinedStr):
                return
            for value in filter(lambda node: isinstance(node, ast.FormattedValue),
                                maybe_fstring.values):
                for call in filter(lambda node: isinstance(node, ast.Call),
                                   ast.walk(value)):
                    func = call.func
                    if isinstance(func, ast.Name):
                        func_name = func.id
                    elif isinstance(func, ast.Attribute):
                        func_name = func.attr
                    else:
                        continue

                    if func_name not in opts.keywords:
                        continue
                    if len(call.args) != 1:
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected amount of'
                            ' positional arguments in gettext call: %(source_segment)s'
                        ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    if call.keywords:
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
                            ' in gettext call: %(source_segment)s'
                        ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    arg = call.args[0]
                    if not isinstance(arg, ast.Constant):
                        print(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected argument type'
                            ' in gettext call: %(source_segment)s'
                        ) % {
                            'source_segment': ast.get_source_segment(tstring, call) or tstring,
                            'file': self.__curfile,
                            'lineno': lineno
                        }, file=sys.stderr)
                        continue
                    if isinstance(arg.value, str):
                        self.__addentry(arg.value, lineno)

    def __suiteseen(self, ttype, tstring, lineno):
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings. Record the
            # line number of the first line of the strings and update the list
            # of messages seen. Reset state for the next batch. If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        if lineno is None:
            lineno = self.__lineno
        # NOTE: toexclude entries come from readlines() and keep their
        # trailing newline, so an exclusion must match exactly.
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Emit the collected messages to `fp` in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries. First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so. This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples. We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)



def main():
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # 'no-docstrings' takes a filename argument (see the usage
             # text), so it needs the trailing '=' for getopt.
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # one filename per line; strip the trailing newline
            with open(arg) as fp:
                for line in fp:
                    options.nodocstrings[line[:-1]] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                options.toexclude = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')