1#! /usr/bin/env python 2# -*- coding: iso-8859-1 -*- 3# Originally written by Barry Warsaw <barry@python.org> 4# 5# Minimally patched to make it even more xgettext compatible 6# by Peter Funk <pf@artcom-gmbh.de> 7# 8# 2002-11-22 J�rgen Hermann <jh@web.de> 9# Added checks that _() only contains string literals, and 10# command line args are resolved to module lists, i.e. you 11# can now pass a filename, a module or package name, or a 12# directory (including globbing chars, important for Win32). 13# Made docstring fit in 80 chars wide displays using pydoc. 14# 15 16# for selftesting 17try: 18 import fintl 19 _ = fintl.gettext 20except ImportError: 21 _ = lambda s: s 22 23__doc__ = _("""pygettext -- Python equivalent of xgettext(1) 24 25Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the 26internationalization of C programs. Most of these tools are independent of 27the programming language and can be used from within Python programs. 28Martin von Loewis' work[1] helps considerably in this regard. 29 30There's one problem though; xgettext is the program that scans source code 31looking for message strings, but it groks only C (or C++). Python 32introduces a few wrinkles, such as dual quoting characters, triple quoted 33strings, and raw strings. xgettext understands none of this. 34 35Enter pygettext, which uses Python's standard tokenize module to scan 36Python source code, generating .pot files identical to what GNU xgettext[2] 37generates for C and C++ code. From there, the standard GNU tools can be 38used. 39 40A word about marking Python strings as candidates for translation. GNU 41xgettext recognizes the following keywords: gettext, dgettext, dcgettext, 42and gettext_noop. But those can be a lot of text to include all over your 43code. C and C++ have a trick: they use the C preprocessor. Most 44internationalized C source includes a #define for gettext() to _() so that 45what has to be written in the source is much less. Thus these are both 46translatable strings: 47 48 gettext("Translatable String") 49 _("Translatable String") 50 51Python of course has no preprocessor so this doesn't work so well. Thus, 52pygettext searches only for _() by default, but see the -k/--keyword flag 53below for how to augment this. 54 55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html 56 [2] http://www.gnu.org/software/gettext/gettext.html 57 58NOTE: pygettext attempts to be option and feature compatible with GNU 59xgettext where ever possible. However some options are still missing or are 60not fully implemented. Also, xgettext's use of command line switches with 61option arguments is broken, and in these cases, pygettext just defines 62additional switches. 63 64Usage: pygettext [options] inputfile ... 65 66Options: 67 68 -a 69 --extract-all 70 Extract all strings. 71 72 -d name 73 --default-domain=name 74 Rename the default output file from messages.pot to name.pot. 75 76 -E 77 --escape 78 Replace non-ASCII characters with octal escape sequences. 79 80 -D 81 --docstrings 82 Extract module, class, method, and function docstrings. These do 83 not need to be wrapped in _() markers, and in fact cannot be for 84 Python to consider them docstrings. (See also the -X option). 85 86 -h 87 --help 88 Print this help message and exit. 89 90 -k word 91 --keyword=word 92 Keywords to look for in addition to the default set, which are: 93 %(DEFAULTKEYWORDS)s 94 95 You can have multiple -k flags on the command line. 96 97 -K 98 --no-default-keywords 99 Disable the default set of keywords (see above). Any keywords 100 explicitly added with the -k/--keyword option are still recognized. 101 102 --no-location 103 Do not write filename/lineno location comments. 104 105 -n 106 --add-location 107 Write filename/lineno location comments indicating where each 108 extracted string is found in the source. These lines appear before 109 each msgid. The style of comments is controlled by the -S/--style 110 option. This is the default. 111 112 -o filename 113 --output=filename 114 Rename the default output file from messages.pot to filename. If 115 filename is `-' then the output is sent to standard out. 116 117 -p dir 118 --output-dir=dir 119 Output files will be placed in directory dir. 120 121 -S stylename 122 --style stylename 123 Specify which style to use for location comments. Two styles are 124 supported: 125 126 Solaris # File: filename, line: line-number 127 GNU #: filename:line 128 129 The style name is case insensitive. GNU style is the default. 130 131 -v 132 --verbose 133 Print the names of the files being processed. 134 135 -V 136 --version 137 Print the version of pygettext and exit. 138 139 -w columns 140 --width=columns 141 Set width of output to columns. 142 143 -x filename 144 --exclude-file=filename 145 Specify a file that contains a list of strings that are not be 146 extracted from the input files. Each string to be excluded must 147 appear on a line by itself in the file. 148 149 -X filename 150 --no-docstrings=filename 151 Specify a file that contains a list of files (one per line) that 152 should not have their docstrings extracted. This is only useful in 153 conjunction with the -D option above. 154 155If `inputfile' is -, standard input is read. 156""") 157 158import os 159import imp 160import sys 161import glob 162import time 163import getopt 164import token 165import tokenize 166import operator 167 168__version__ = '1.5' 169 170default_keywords = ['_'] 171DEFAULTKEYWORDS = ', '.join(default_keywords) 172 173EMPTYSTRING = '' 174 175 176 177# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's 178# there. 179pot_header = _('''\ 180# SOME DESCRIPTIVE TITLE. 181# Copyright (C) YEAR ORGANIZATION 182# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. 183# 184msgid "" 185msgstr "" 186"Project-Id-Version: PACKAGE VERSION\\n" 187"POT-Creation-Date: %(time)s\\n" 188"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" 189"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n" 190"Language-Team: LANGUAGE <LL@li.org>\\n" 191"MIME-Version: 1.0\\n" 192"Content-Type: text/plain; charset=CHARSET\\n" 193"Content-Transfer-Encoding: ENCODING\\n" 194"Generated-By: pygettext.py %(version)s\\n" 195 196''') 197 198 199def usage(code, msg=''): 200 print >> sys.stderr, __doc__ % globals() 201 if msg: 202 print >> sys.stderr, msg 203 sys.exit(code) 204 205 206 207escapes = [] 208 209def make_escapes(pass_iso8859): 210 global escapes 211 escapes = [chr(i) for i in range(256)] 212 if pass_iso8859: 213 # Allow iso-8859 characters to pass through so that e.g. 'msgid 214 # "H�he"' would result not result in 'msgid "H\366he"'. Otherwise we 215 # escape any character outside the 32..126 range. 216 mod = 128 217 else: 218 mod = 256 219 for i in range(mod): 220 if not(32 <= i <= 126): 221 escapes[i] = "\\%03o" % i 222 escapes[ord('\\')] = '\\\\' 223 escapes[ord('\t')] = '\\t' 224 escapes[ord('\r')] = '\\r' 225 escapes[ord('\n')] = '\\n' 226 escapes[ord('\"')] = '\\"' 227 228 229def escape(s): 230 global escapes 231 s = list(s) 232 for i in range(len(s)): 233 s[i] = escapes[ord(s[i])] 234 return EMPTYSTRING.join(s) 235 236 237def safe_eval(s): 238 # unwrap quotes, safely 239 return eval(s, {'__builtins__':{}}, {}) 240 241 242def normalize(s): 243 # This converts the various Python string types into a format that is 244 # appropriate for .po files, namely much closer to C style. 245 lines = s.split('\n') 246 if len(lines) == 1: 247 s = '"' + escape(s) + '"' 248 else: 249 if not lines[-1]: 250 del lines[-1] 251 lines[-1] = lines[-1] + '\n' 252 for i in range(len(lines)): 253 lines[i] = escape(lines[i]) 254 lineterm = '\\n"\n"' 255 s = '""\n"' + lineterm.join(lines) + '"' 256 return s 257 258 259def containsAny(str, set): 260 """Check whether 'str' contains ANY of the chars in 'set'""" 261 return 1 in [c in str for c in set] 262 263 264def _get_modpkg_path(dotted_name, pathlist=None): 265 """Get the filesystem path for a module or a package. 266 267 Return the file system path to a file for a module, and to a directory for 268 a package. Return None if the name is not found, or is a builtin or 269 extension module. 270 """ 271 # split off top-most name 272 parts = dotted_name.split('.', 1) 273 274 if len(parts) > 1: 275 # we have a dotted path, import top-level package 276 try: 277 file, pathname, description = imp.find_module(parts[0], pathlist) 278 if file: file.close() 279 except ImportError: 280 return None 281 282 # check if it's indeed a package 283 if description[2] == imp.PKG_DIRECTORY: 284 # recursively handle the remaining name parts 285 pathname = _get_modpkg_path(parts[1], [pathname]) 286 else: 287 pathname = None 288 else: 289 # plain name 290 try: 291 file, pathname, description = imp.find_module( 292 dotted_name, pathlist) 293 if file: 294 file.close() 295 if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]: 296 pathname = None 297 except ImportError: 298 pathname = None 299 300 return pathname 301 302 303def getFilesForName(name): 304 """Get a list of module files for a filename, a module or package name, 305 or a directory. 306 """ 307 if not os.path.exists(name): 308 # check for glob chars 309 if containsAny(name, "*?[]"): 310 files = glob.glob(name) 311 list = [] 312 for file in files: 313 list.extend(getFilesForName(file)) 314 return list 315 316 # try to find module or package 317 name = _get_modpkg_path(name) 318 if not name: 319 return [] 320 321 if os.path.isdir(name): 322 # find all python files in directory 323 list = [] 324 # get extension for python source files 325 if '_py_ext' not in globals(): 326 global _py_ext 327 _py_ext = [triple[0] for triple in imp.get_suffixes() 328 if triple[2] == imp.PY_SOURCE][0] 329 for root, dirs, files in os.walk(name): 330 # don't recurse into CVS directories 331 if 'CVS' in dirs: 332 dirs.remove('CVS') 333 # add all *.py files to list 334 list.extend( 335 [os.path.join(root, file) for file in files 336 if os.path.splitext(file)[1] == _py_ext] 337 ) 338 return list 339 elif os.path.exists(name): 340 # a single file 341 return [name] 342 343 return [] 344 345 346class TokenEater: 347 def __init__(self, options): 348 self.__options = options 349 self.__messages = {} 350 self.__state = self.__waiting 351 self.__data = [] 352 self.__lineno = -1 353 self.__freshmodule = 1 354 self.__curfile = None 355 356 def __call__(self, ttype, tstring, stup, etup, line): 357 # dispatch 358## import token 359## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \ 360## 'tstring:', tstring 361 self.__state(ttype, tstring, stup[0]) 362 363 def __waiting(self, ttype, tstring, lineno): 364 opts = self.__options 365 # Do docstring extractions, if enabled 366 if opts.docstrings and not opts.nodocstrings.get(self.__curfile): 367 # module docstring? 368 if self.__freshmodule: 369 if ttype == tokenize.STRING: 370 self.__addentry(safe_eval(tstring), lineno, isdocstring=1) 371 self.__freshmodule = 0 372 elif ttype not in (tokenize.COMMENT, tokenize.NL): 373 self.__freshmodule = 0 374 return 375 # class docstring? 376 if ttype == tokenize.NAME and tstring in ('class', 'def'): 377 self.__state = self.__suiteseen 378 return 379 if ttype == tokenize.NAME and tstring in opts.keywords: 380 self.__state = self.__keywordseen 381 382 def __suiteseen(self, ttype, tstring, lineno): 383 # ignore anything until we see the colon 384 if ttype == tokenize.OP and tstring == ':': 385 self.__state = self.__suitedocstring 386 387 def __suitedocstring(self, ttype, tstring, lineno): 388 # ignore any intervening noise 389 if ttype == tokenize.STRING: 390 self.__addentry(safe_eval(tstring), lineno, isdocstring=1) 391 self.__state = self.__waiting 392 elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, 393 tokenize.COMMENT): 394 # there was no class docstring 395 self.__state = self.__waiting 396 397 def __keywordseen(self, ttype, tstring, lineno): 398 if ttype == tokenize.OP and tstring == '(': 399 self.__data = [] 400 self.__lineno = lineno 401 self.__state = self.__openseen 402 else: 403 self.__state = self.__waiting 404 405 def __openseen(self, ttype, tstring, lineno): 406 if ttype == tokenize.OP and tstring == ')': 407 # We've seen the last of the translatable strings. Record the 408 # line number of the first line of the strings and update the list 409 # of messages seen. Reset state for the next batch. If there 410 # were no strings inside _(), then just ignore this entry. 411 if self.__data: 412 self.__addentry(EMPTYSTRING.join(self.__data)) 413 self.__state = self.__waiting 414 elif ttype == tokenize.STRING: 415 self.__data.append(safe_eval(tstring)) 416 elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, 417 token.NEWLINE, tokenize.NL]: 418 # warn if we see anything else than STRING or whitespace 419 print >> sys.stderr, _( 420 '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' 421 ) % { 422 'token': tstring, 423 'file': self.__curfile, 424 'lineno': self.__lineno 425 } 426 self.__state = self.__waiting 427 428 def __addentry(self, msg, lineno=None, isdocstring=0): 429 if lineno is None: 430 lineno = self.__lineno 431 if not msg in self.__options.toexclude: 432 entry = (self.__curfile, lineno) 433 self.__messages.setdefault(msg, {})[entry] = isdocstring 434 435 def set_filename(self, filename): 436 self.__curfile = filename 437 self.__freshmodule = 1 438 439 def write(self, fp): 440 options = self.__options 441 timestamp = time.strftime('%Y-%m-%d %H:%M+%Z') 442 # The time stamp in the header doesn't have the same format as that 443 # generated by xgettext... 444 print >> fp, pot_header % {'time': timestamp, 'version': __version__} 445 # Sort the entries. First sort each particular entry's keys, then 446 # sort all the entries by their first item. 447 reverse = {} 448 for k, v in self.__messages.items(): 449 keys = v.keys() 450 keys.sort() 451 reverse.setdefault(tuple(keys), []).append((k, v)) 452 rkeys = reverse.keys() 453 rkeys.sort() 454 for rkey in rkeys: 455 rentries = reverse[rkey] 456 rentries.sort() 457 for k, v in rentries: 458 isdocstring = 0 459 # If the entry was gleaned out of a docstring, then add a 460 # comment stating so. This is to aid translators who may wish 461 # to skip translating some unimportant docstrings. 462 if reduce(operator.__add__, v.values()): 463 isdocstring = 1 464 # k is the message string, v is a dictionary-set of (filename, 465 # lineno) tuples. We want to sort the entries in v first by 466 # file name and then by line number. 467 v = v.keys() 468 v.sort() 469 if not options.writelocations: 470 pass 471 # location comments are different b/w Solaris and GNU: 472 elif options.locationstyle == options.SOLARIS: 473 for filename, lineno in v: 474 d = {'filename': filename, 'lineno': lineno} 475 print >>fp, _( 476 '# File: %(filename)s, line: %(lineno)d') % d 477 elif options.locationstyle == options.GNU: 478 # fit as many locations on one line, as long as the 479 # resulting line length doesn't exceed 'options.width' 480 locline = '#:' 481 for filename, lineno in v: 482 d = {'filename': filename, 'lineno': lineno} 483 s = _(' %(filename)s:%(lineno)d') % d 484 if len(locline) + len(s) <= options.width: 485 locline = locline + s 486 else: 487 print >> fp, locline 488 locline = "#:" + s 489 if len(locline) > 2: 490 print >> fp, locline 491 if isdocstring: 492 print >> fp, '#, docstring' 493 print >> fp, 'msgid', normalize(k) 494 print >> fp, 'msgstr ""\n' 495 496 497 498def main(): 499 global default_keywords 500 try: 501 opts, args = getopt.getopt( 502 sys.argv[1:], 503 'ad:DEhk:Kno:p:S:Vvw:x:X:', 504 ['extract-all', 'default-domain=', 'escape', 'help', 505 'keyword=', 'no-default-keywords', 506 'add-location', 'no-location', 'output=', 'output-dir=', 507 'style=', 'verbose', 'version', 'width=', 'exclude-file=', 508 'docstrings', 'no-docstrings', 509 ]) 510 except getopt.error, msg: 511 usage(1, msg) 512 513 # for holding option values 514 class Options: 515 # constants 516 GNU = 1 517 SOLARIS = 2 518 # defaults 519 extractall = 0 # FIXME: currently this option has no effect at all. 520 escape = 0 521 keywords = [] 522 outpath = '' 523 outfile = 'messages.pot' 524 writelocations = 1 525 locationstyle = GNU 526 verbose = 0 527 width = 78 528 excludefilename = '' 529 docstrings = 0 530 nodocstrings = {} 531 532 options = Options() 533 locations = {'gnu' : options.GNU, 534 'solaris' : options.SOLARIS, 535 } 536 537 # parse options 538 for opt, arg in opts: 539 if opt in ('-h', '--help'): 540 usage(0) 541 elif opt in ('-a', '--extract-all'): 542 options.extractall = 1 543 elif opt in ('-d', '--default-domain'): 544 options.outfile = arg + '.pot' 545 elif opt in ('-E', '--escape'): 546 options.escape = 1 547 elif opt in ('-D', '--docstrings'): 548 options.docstrings = 1 549 elif opt in ('-k', '--keyword'): 550 options.keywords.append(arg) 551 elif opt in ('-K', '--no-default-keywords'): 552 default_keywords = [] 553 elif opt in ('-n', '--add-location'): 554 options.writelocations = 1 555 elif opt in ('--no-location',): 556 options.writelocations = 0 557 elif opt in ('-S', '--style'): 558 options.locationstyle = locations.get(arg.lower()) 559 if options.locationstyle is None: 560 usage(1, _('Invalid value for --style: %s') % arg) 561 elif opt in ('-o', '--output'): 562 options.outfile = arg 563 elif opt in ('-p', '--output-dir'): 564 options.outpath = arg 565 elif opt in ('-v', '--verbose'): 566 options.verbose = 1 567 elif opt in ('-V', '--version'): 568 print _('pygettext.py (xgettext for Python) %s') % __version__ 569 sys.exit(0) 570 elif opt in ('-w', '--width'): 571 try: 572 options.width = int(arg) 573 except ValueError: 574 usage(1, _('--width argument must be an integer: %s') % arg) 575 elif opt in ('-x', '--exclude-file'): 576 options.excludefilename = arg 577 elif opt in ('-X', '--no-docstrings'): 578 fp = open(arg) 579 try: 580 while 1: 581 line = fp.readline() 582 if not line: 583 break 584 options.nodocstrings[line[:-1]] = 1 585 finally: 586 fp.close() 587 588 # calculate escapes 589 make_escapes(not options.escape) 590 591 # calculate all keywords 592 options.keywords.extend(default_keywords) 593 594 # initialize list of strings to exclude 595 if options.excludefilename: 596 try: 597 fp = open(options.excludefilename) 598 options.toexclude = fp.readlines() 599 fp.close() 600 except IOError: 601 print >> sys.stderr, _( 602 "Can't read --exclude-file: %s") % options.excludefilename 603 sys.exit(1) 604 else: 605 options.toexclude = [] 606 607 # resolve args to module lists 608 expanded = [] 609 for arg in args: 610 if arg == '-': 611 expanded.append(arg) 612 else: 613 expanded.extend(getFilesForName(arg)) 614 args = expanded 615 616 # slurp through all the files 617 eater = TokenEater(options) 618 for filename in args: 619 if filename == '-': 620 if options.verbose: 621 print _('Reading standard input') 622 fp = sys.stdin 623 closep = 0 624 else: 625 if options.verbose: 626 print _('Working on %s') % filename 627 fp = open(filename) 628 closep = 1 629 try: 630 eater.set_filename(filename) 631 try: 632 tokenize.tokenize(fp.readline, eater) 633 except tokenize.TokenError, e: 634 print >> sys.stderr, '%s: %s, line %d, column %d' % ( 635 e[0], filename, e[1][0], e[1][1]) 636 finally: 637 if closep: 638 fp.close() 639 640 # write the output 641 if options.outfile == '-': 642 fp = sys.stdout 643 closep = 0 644 else: 645 if options.outpath: 646 options.outfile = os.path.join(options.outpath, options.outfile) 647 fp = open(options.outfile, 'w') 648 closep = 1 649 try: 650 eater.write(fp) 651 finally: 652 if closep: 653 fp.close() 654 655 656if __name__ == '__main__': 657 main() 658 # some more test strings 659 _(u'a unicode string') 660 # this one creates a warning 661 _('*** Seen unexpected token "%(token)s"') % {'token': 'test'} 662 _('more' 'than' 'one' 'string') 663