#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris # File: filename, line: line-number
        GNU #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import token
import tokenize

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the module help (and optional message) to stderr, then exit.

    The help text is the module docstring interpolated with the module
    globals (it references %(DEFAULTKEYWORDS)s).  `code' is passed to
    sys.exit().
    """
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)



def make_escapes(pass_nonascii):
    """Build the module-level `escapes' table and select `escape'.

    Must be called before normalize() is used, since normalize() relies on
    the global `escape' function installed here.

    pass_nonascii -- when true, non-ASCII characters are written through
    unchanged (escape_ascii); otherwise the string is encoded and every
    byte outside the printable ASCII range is written as an octal escape
    (escape_nonascii).
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
        escape = escape_ascii
    else:
        mod = 256
        escape = escape_nonascii
    # Start with an octal escape for everything, then punch holes for the
    # printable range and the characters that have a short C-style escape.
    escapes = [r"\%03o" % i for i in range(mod)]
    for i in range(32, 127):
        escapes[i] = chr(i)
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    """Escape only the ASCII characters of s; `encoding' is unused."""
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)

def escape_nonascii(s, encoding):
    """Encode s with `encoding' and escape it byte by byte."""
    return ''.join(escapes[b] for b in s.encode(encoding))


def is_literal_string(s):
    """Return true if token text s is a plain, r- or u-prefixed string.

    Other prefixes (for example b or f) are not accepted.
    """
    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')


def safe_eval(s):
    """Turn the source text of a string literal into its value."""
    # unwrap quotes, safely
    # NOTE(review): this is still eval(), only with builtins removed.  The
    # callers in this file pass STRING tokens that passed
    # is_literal_string(); do not feed it anything else.
    return eval(s, {'__builtins__':{}}, {})


def normalize(s, encoding):
    """Return s quoted and escaped for use as a msgid in a .po file.

    A string without newlines becomes one quoted line.  A multi-line string
    becomes an empty first line ("") followed by one quoted line per source
    line, which is the layout gettext tools use.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s, encoding) + '"'
    else:
        if not lines[-1]:
            # s ended with a newline: fold it back into the last real line
            # instead of emitting a trailing empty string.
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        lines = [escape(line, encoding) for line in lines]
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return any(c in str for c in set)


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Names that cannot be resolved to anything on disk yield an empty list.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            found = []
            for match in glob.glob(name):
                found.extend(getFilesForName(match))
            return found

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            spec = None
        # find_spec() returns None (it does not raise) for an unknown
        # top-level name.
        if spec is None:
            return []
        if spec.submodule_search_locations is not None:
            # A package: its origin is the __init__.py file, but we want
            # to scan the whole package directory.  Namespace packages
            # have no origin, so fall back to the first search location.
            if spec.origin:
                name = os.path.dirname(spec.origin)
            else:
                name = next(iter(spec.submodule_search_locations), None)
        else:
            name = spec.origin
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        # get extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            found.extend(
                os.path.join(root, fname) for fname in files
                if os.path.splitext(fname)[1] == _py_ext
                )
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    """State machine that collects translatable strings from a token stream.

    Feed it the tuples produced by tokenize (the instance is callable),
    call set_filename() before each new input, and finally call write() to
    emit the collected messages as a .pot file.
    """

    def __init__(self, options):
        self.__options = options
        # msg -> {(filename, lineno): isdocstring}
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the handler for the current state; only the token
        # type, its text and its starting row are needed
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for docstring positions and keyword names."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                # tokenize.tokenize() always emits an ENCODING token first;
                # like comments and blank lines it may precede the module
                # docstring, so keep waiting.
                if ttype in (tokenize.COMMENT, tokenize.NL,
                             tokenize.ENCODING):
                    return
                # Anything else means there is no module docstring.  Fall
                # through so this token is still examined below.
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """After 'class'/'def': skip ahead to the colon ending the header."""
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """Right after a suite header: record the docstring if there is one."""
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword name: only an opening paren starts a message."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): gather the string literals up to ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record msg at (current file, lineno) unless it is excluded."""
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; its first string may be a docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the pot header and all collected messages to fp.

        fp is a text file object; its `encoding' attribute (UTF-8 when it
        is unset) is used for the charset header and for escaping.
        make_escapes() must have been called beforehand.
        """
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            # Message strings are unique, so ordering by the string alone
            # is enough and never has to compare the location dicts.
            rentries.sort(key=lambda entry: entry[0])
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)



def main():
    """Command line entry point.

    Parses sys.argv, resolves the input arguments to a list of files,
    feeds each file's tokens to a TokenEater and writes the resulting
    catalog to the selected output.  Exits through usage() on bad options.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # --no-docstrings takes a filename (like -X), hence the '='
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    # Drop only the line terminator, so a last line
                    # without one keeps its final character.
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                raw_lines = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
        # Each line names one string.  Match it without its line
        # terminator (otherwise ordinary one-line messages could never be
        # excluded); the raw lines are kept as well so that anything that
        # matched before still matches.
        options.toexclude = set(raw_lines)
        options.toexclude.update(line.rstrip('\n') for line in raw_lines)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')