#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Juergen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] https://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] https://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import ast
import token
import tokenize

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''


# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the help text (and `msg', if given) to stderr, then exit.

    The help text is the module docstring interpolated with the module
    globals; `code' is passed to sys.exit().
    """
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)


def make_escapes(pass_nonascii):
    """Build the module-level escape table and select the escape function.

    Sets the globals `escapes' (a list indexed by character/byte value) and
    `escape' (escape_ascii or escape_nonascii). This must be called before
    normalize() is used.
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through, so that e.g. a msgid
        # containing an o-umlaut (U+00F6) is written as-is rather than as
        # an octal escape. Only characters below 128 go through the table.
        mod = 128
        escape = escape_ascii
    else:
        # Every byte of the encoded string goes through the table, so any
        # byte outside the 32..126 range is escaped.
        mod = 256
        escape = escape_nonascii
    escapes = [r"\%03o" % i for i in range(mod)]
    # Printable ASCII stands for itself ...
    for i in range(32, 127):
        escapes[i] = chr(i)
    # ... except for the characters that need C-style escaping.
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    """Escape characters below 128 via `escapes'; pass the rest through.

    `encoding' is unused; it is accepted so that both escape functions
    share one signature.
    """
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)


def escape_nonascii(s, encoding):
    """Encode `s' with `encoding' and escape every resulting byte."""
    return ''.join(escapes[b] for b in s.encode(encoding))


def is_literal_string(s):
    """Return true if token text `s' starts like a plain/raw/unicode string.

    Accepts an opening quote, or a single r/R/u/U prefix followed by a
    quote; anything else (e.g. f- or b-prefixed strings) is rejected.
    """
    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')


def safe_eval(s):
    """Evaluate string-literal source text `s' and return its value."""
    # unwrap quotes, safely
    # NOTE(review): this is eval() with empty builtins. Within this file it
    # is only called on tokens accepted by is_literal_string();
    # ast.literal_eval() would be a stricter alternative -- confirm before
    # switching.
    return eval(s, {'__builtins__':{}}, {})


def normalize(s, encoding):
    """Return `s' as a quoted, escaped string suitable for a .po file.

    A single-line string becomes one quoted string. A multi-line string
    becomes an empty "" line followed by one quoted string per line.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        return '"' + escape(s, encoding) + '"'
    if not lines[-1]:
        # `s' ended with a newline: fold it back into the last real line
        # so that it is emitted as an explicit \n escape.
        del lines[-1]
        lines[-1] = lines[-1] + '\n'
    lineterm = '\\n"\n"'
    return '""\n"' + lineterm.join(escape(line, encoding)
                                   for line in lines) + '"'


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return any(c in str for c in set)


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Returns an empty list when `name' cannot be resolved to anything.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            found = []
            for match in glob.glob(name):
                found.extend(getFilesForName(match))
            return found

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            spec = None
        # find_spec() returns None (it does not raise) when the module is
        # simply not found, and the origin itself may be None.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        # get extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            found.extend(
                os.path.join(root, file) for file in files
                if os.path.splitext(file)[1] == _py_ext
                )
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    """Token-driven state machine that collects translatable strings.

    An instance is called once per token with the 5-tuple produced by the
    tokenize module. The current state is a bound method stored in
    self.__state. Collected messages are kept in a dict mapping each
    message to a dict of {(filename, lineno): isdocstring}, and are written
    out by write().
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state; only the start row is needed
        self.__state(ttype, tstring, stup[0])

    def __warn_call(self, template, call, tstring, lineno):
        # Report a keyword call inside an f-string that cannot be
        # extracted. `template' is the already translated message.
        print(template % {
            'source_segment': ast.get_source_segment(tstring, call) or tstring,
            'file': self.__curfile,
            'lineno': lineno
            }, file=sys.stderr)

    def __scan_fstring(self, tstring, lineno):
        # Look for keyword calls with a single constant string argument
        # inside the replacement fields of an f-string token.
        maybe_fstring = ast.parse(tstring, mode='eval').body
        if not isinstance(maybe_fstring, ast.JoinedStr):
            return
        keywords = self.__options.keywords
        for value in maybe_fstring.values:
            if not isinstance(value, ast.FormattedValue):
                continue
            for call in ast.walk(value):
                if not isinstance(call, ast.Call):
                    continue
                func = call.func
                if isinstance(func, ast.Name):
                    func_name = func.id
                elif isinstance(func, ast.Attribute):
                    func_name = func.attr
                else:
                    continue

                if func_name not in keywords:
                    continue
                if len(call.args) != 1:
                    self.__warn_call(_(
                        '*** %(file)s:%(lineno)s: Seen unexpected amount of'
                        ' positional arguments in gettext call: %(source_segment)s'
                        ), call, tstring, lineno)
                    continue
                if call.keywords:
                    self.__warn_call(_(
                        '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
                        ' in gettext call: %(source_segment)s'
                        ), call, tstring, lineno)
                    continue
                arg = call.args[0]
                if not isinstance(arg, ast.Constant):
                    self.__warn_call(_(
                        '*** %(file)s:%(lineno)s: Seen unexpected argument type'
                        ' in gettext call: %(source_segment)s'
                        ), call, tstring, lineno)
                    continue
                if isinstance(arg.value, str):
                    self.__addentry(arg.value, lineno)

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
                    return
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in ('class', 'def'):
            # skip the token after class/def, so that a definition named
            # like a keyword is not taken for a keyword call
            self.__state = self.__ignorenext
            return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen
            return
        if ttype == tokenize.STRING:
            self.__scan_fstring(tstring, lineno)

    def __suiteseen(self, ttype, tstring, lineno):
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings. Record the
            # line number of the first line of the strings and update the list
            # of messages seen. Reset state for the next batch. If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __ignorenext(self, ttype, tstring, lineno):
        self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # Record `msg' at (current file, lineno) unless it is excluded;
        # lineno defaults to the line of the opening keyword call.
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file: remember its name, expect a docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the pot header and all collected messages to `fp'."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries. First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so. This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples. We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)


def _read_stripped_lines(filename):
    """Return the lines of text file `filename' without trailing newlines."""
    with open(filename) as fp:
        return [line.rstrip('\n') for line in fp]


def main():
    """Command line entry point: parse sys.argv, scan inputs, write the pot.

    Exits through usage()/sys.exit() on bad options, --help and --version.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # --no-docstrings takes a filename, hence the trailing '='
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # Strip only the newline: slicing off the last character would
            # corrupt a final line that has no trailing newline.
            for line in _read_stripped_lines(arg):
                options.nodocstrings[line] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            # One string per line; the line terminator is not part of the
            # string, otherwise extracted messages would never match.
            options.toexclude = set(
                _read_stripped_lines(options.excludefilename))
        except OSError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = set()

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')