#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# Originally written by Barry Warsaw <barry@zope.com>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Jürgen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#
# Ported to Python 3: print(), "except ... as", os.walk() instead of
# os.path.walk(), importlib instead of imp, tokenize.generate_tokens()
# instead of the callback form of tokenize.tokenize().
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris # File: filename, line: line-number
        GNU     #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import sys
import glob
import time
import getopt
import token
import tokenize

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''

# Extension of Python source files (normally '.py').
_py_ext = importlib.machinery.SOURCE_SUFFIXES[0]



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the help text (and optionally 'msg') to stderr, then exit.

    'code' is the process exit status passed to sys.exit().
    """
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)



# Escape table indexed by character/byte value 0..255; filled in by
# make_escapes().
escapes = []

# True when the table was built in "pass non-ASCII through" mode.
_pass_nonascii = False


def make_escapes(pass_iso8859):
    """(Re)build the global escape table used by escape().

    If 'pass_iso8859' is true, printable iso-8859 characters (160..254) are
    passed through so that e.g. 'msgid "Höhe"' does not end up as
    'msgid "H\\366he"'.  Otherwise everything outside the 32..126 range is
    replaced by an octal escape sequence.
    """
    global _pass_nonascii
    _pass_nonascii = bool(pass_iso8859)
    if pass_iso8859:
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Replace the contents in place: calling this function twice must not
    # grow the table, and other modules holding a reference to 'escapes'
    # must see the new contents.
    escapes[:] = table


def escape(s):
    """Return 's' with special characters replaced by C-style escapes.

    make_escapes() must have been called first.  In pass-through mode
    characters beyond the 256-entry table are copied unchanged; in escape
    mode the string is encoded as UTF-8 and every non-ASCII byte becomes an
    octal escape, so the result is pure ASCII.
    """
    if _pass_nonascii:
        return EMPTYSTRING.join(
            [escapes[ord(c)] if ord(c) < 256 else c for c in s])
    return EMPTYSTRING.join(
        [escapes[b] for b in s.encode('utf-8', 'surrogatepass')])


def _is_literal_string(s):
    """Return true if token text 's' is a plain (non-bytes, non-f) string."""
    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')


def safe_eval(s):
    """Evaluate the source text of a string literal and return its value."""
    # unwrap quotes, safely
    # NOTE(review): this is eval() on text taken from the scanned source.
    # It is only ever called with a single STRING token that passed
    # _is_literal_string(), and builtins are disabled; keep it that way.
    return eval(s, {'__builtins__': {}}, {})


def normalize(s):
    """Return 's' as a quoted msgid value in .po file syntax.

    Single-line strings become one quoted string; multi-line strings become
    an empty string followed by one quoted string per source line.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s) + '"'
    else:
        if not lines[-1]:
            # the string ended with a newline: fold it into the last line
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        lines = [escape(line) for line in lines]
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return any(c in str for c in set)


def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName().

    Append to 'list' the path of every Python source file in 'names', which
    are entries of the directory 'dirname'.
    """
    # add all *.py files to list
    list.extend(
        [os.path.join(dirname, file) for file in names
         if os.path.splitext(file)[1] == _py_ext]
        )


def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory for
    a package. Return None if the name is not found, or is a builtin or
    extension module.

    The lookup never imports anything; 'pathlist' is the list of directories
    to search (sys.path when None).
    """
    # split off top-most name
    parts = dotted_name.split('.', 1)
    if not parts[0]:
        return None

    try:
        spec = importlib.machinery.PathFinder.find_spec(parts[0], pathlist)
    except (ImportError, ValueError):
        return None
    if spec is None:
        return None

    # submodule_search_locations is None for plain modules
    locations = spec.submodule_search_locations

    if len(parts) > 1:
        # we have a dotted path; the top-level name must be a package
        if not locations:
            return None
        # recursively handle the remaining name parts
        return _get_modpkg_path(parts[1], list(locations))

    # plain name
    if locations:
        # a package: return its directory
        return list(locations)[0]
    origin = spec.origin
    if origin and origin.endswith(tuple(importlib.machinery.SOURCE_SUFFIXES)):
        return origin
    # builtin, extension or bytecode-only module
    return None


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            found = []
            for file in glob.glob(name):
                found.extend(getFilesForName(file))
            return found

        # try to find module or package
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        for dirpath, dirnames, filenames in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirnames:
                dirnames.remove('CVS')
            _visit_pyfiles(found, dirpath, filenames)
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    """State machine that collects translatable strings from a token stream.

    Feed it the 5-tuples produced by tokenize.generate_tokens() by calling
    the instance, call set_filename() before each new file, and finally
    call write() to emit the .pot file.
    """

    def __init__(self, options):
        self.__options = options
        # maps message -> {(filename, lineno): isdocstring}
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        # nesting depth of ([{ while scanning a class/def header
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state handler
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and _is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL,
                             tokenize.ENCODING):
                    return
                # First real token is not a docstring.  Fall through so that
                # this token is still examined below instead of being lost.
                self.__freshmodule = 0
            # class or function docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__enclosurecount = 0
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # Skip everything up to the colon that ends the class/def header.
        # Colons inside brackets (annotations, dict defaults, slices) do not
        # count, so keep track of the nesting depth.
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                self.__state = self.__suitedocstring
            elif tstring in ('(', '[', '{'):
                self.__enclosurecount += 1
            elif tstring in (')', ']', '}'):
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and _is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.NL, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings. Record the
            # line number of the first line of the strings and update the list
            # of messages seen. Reset state for the next batch. If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and _is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Declare that the tokens that follow come from 'filename'."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the collected messages to the file object 'fp' as a .pot."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print(pot_header % {'time': timestamp, 'version': __version__},
              file=fp)
        # Sort the entries. First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        for rkey in sorted(reverse.keys()):
            rentries = reverse[rkey]
            # messages are unique, so sorting on them alone is a total order
            rentries.sort(key=lambda entry: entry[0])
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so. This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples. We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d,
                            file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceeds 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k), file=fp)
                print('msgstr ""\n', file=fp)



def main():
    """Command line entry point: parse sys.argv, scan inputs, write .pot."""
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # --no-docstrings takes a file name, hence the '='
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }
    # -K clears this flag instead of rebinding the module-level list
    use_default_keywords = True

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            use_default_keywords = False
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    # strip only the line terminator; a final line without
                    # one must keep its last character
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes: non-ASCII characters pass through unless -E is given
    make_escapes(not options.escape)

    # calculate all keywords
    if use_default_keywords:
        options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                # one string per line; drop the line terminator so that the
                # entries can actually compare equal to extracted messages
                options.toexclude = [line.rstrip('\n') for line in fp]
        except OSError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename,
                file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            # tokenize.open() honours the coding cookie / BOM of the source
            fp = tokenize.open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                for tokeninfo in tokenize.generate_tokens(fp.readline):
                    eater(*tokeninfo)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        # msgids may contain arbitrary text; do not depend on the locale
        fp = open(options.outfile, 'w', encoding='utf-8')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')