[I18n-sig] pygettext.py extraction of docstrings
Barry A. Warsaw
barry@wooz.org
Thu, 26 Oct 2000 15:48:33 -0400 (EDT)
Hi all,
I have a tentative patch for Tools/i18n/pygettext.py which adds
optional extraction of module, class, method, and function
docstrings. Francois Pinard's po-utils does something similar, I
believe, and it makes a lot of sense to add this.
If you provide the -D/--docstrings flag, then it'll extract these
docstrings without requiring them to be wrapped in _() markers. You'd,
of course, still need to send the strings through a translation step,
but that's okay because you'll probably want deferred translation of
them anyway.
I've only done some minimal testing so I don't know how easy it is to
confuse the TokenEater.
One question: should docstring extraction be turned on by default?
Attached is a patch against Python 2.0's pygettext.py.
-Barry
-------------------- snip snip --------------------
Index: pygettext.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Tools/i18n/pygettext.py,v
retrieving revision 1.9
diff -u -r1.9 pygettext.py
--- pygettext.py 2000/05/02 19:28:30 1.9
+++ pygettext.py 2000/10/26 19:43:18
@@ -4,16 +4,8 @@
# minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
-# for selftesting
-try:
- import fintl
- _ = fintl.gettext
-except ImportError:
- def _(s): return s
+"""pygettext -- Python equivalent of xgettext(1)
-
-__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
-
Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs. Martin
@@ -65,7 +57,12 @@
-E
--escape
- replace non-ASCII characters with octal escape sequences.
+ Replace non-ASCII characters with octal escape sequences.
+
+ -D
+ --docstrings
+ Extract module, class, method, and function docstrings. This requires
+ an import of the specified module, so beware of import side effects.
-h
--help
@@ -132,15 +129,22 @@
If `inputfile' is -, standard input is read.
-""")
+"""
import os
import sys
import time
import getopt
import tokenize
+
+# for selftesting
+try:
+ import fintl
+ _ = fintl.gettext
+except ImportError:
+ def _(s): return s
-__version__ = '1.1'
+__version__ = '1.2'
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)
@@ -171,9 +175,9 @@
def usage(code, msg=''):
- print __doc__ % globals()
+ print >> sys.stderr, _(__doc__) % globals()
if msg:
- print msg
+ print >> sys.stderr, msg
sys.exit(code)
@@ -239,15 +243,48 @@
self.__state = self.__waiting
self.__data = []
self.__lineno = -1
+ self.__freshmodule = 1
def __call__(self, ttype, tstring, stup, etup, line):
# dispatch
+## import token
+## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
+## 'tstring:', tstring
self.__state(ttype, tstring, stup[0])
def __waiting(self, ttype, tstring, lineno):
+ # Do docstring extractions, if enabled
+ if self.__options.docstrings:
+ # module docstring?
+ if self.__freshmodule:
+ if ttype == tokenize.STRING:
+ self.__addentry(safe_eval(tstring), lineno)
+ self.__freshmodule = 0
+ elif ttype not in (tokenize.COMMENT, tokenize.NL):
+ self.__freshmodule = 0
+ return
+ # class docstring?
+ if ttype == tokenize.NAME and tstring in ('class', 'def'):
+ self.__state = self.__suiteseen
+ return
if ttype == tokenize.NAME and tstring in self.__options.keywords:
self.__state = self.__keywordseen
+ def __suiteseen(self, ttype, tstring, lineno):
+ # ignore anything until we see the colon
+ if ttype == tokenize.OP and tstring == ':':
+ self.__state = self.__suitedocstring
+
+ def __suitedocstring(self, ttype, tstring, lineno):
+ # ignore any intervening noise
+ if ttype == tokenize.STRING:
+ self.__addentry(safe_eval(tstring), lineno)
+ self.__state = self.__waiting
+ elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
+ tokenize.COMMENT):
+ # there was no class docstring
+ self.__state = self.__waiting
+
def __keywordseen(self, ttype, tstring, lineno):
if ttype == tokenize.OP and tstring == '(':
self.__data = []
@@ -263,58 +300,54 @@
# of messages seen. Reset state for the next batch. If there
# were no strings inside _(), then just ignore this entry.
if self.__data:
- msg = EMPTYSTRING.join(self.__data)
- if not msg in self.__options.toexclude:
- entry = (self.__curfile, self.__lineno)
- linenos = self.__messages.get(msg)
- if linenos is None:
- self.__messages[msg] = [entry]
- else:
- linenos.append(entry)
+ self.__addentry(EMPTYSTRING.join(self.__data))
self.__state = self.__waiting
elif ttype == tokenize.STRING:
self.__data.append(safe_eval(tstring))
# TBD: should we warn if we seen anything else?
+ def __addentry(self, msg, lineno=None):
+ if lineno is None:
+ lineno = self.__lineno
+ if not msg in self.__options.toexclude:
+ entry = (self.__curfile, lineno)
+ self.__messages.setdefault(msg, []).append(entry)
+
def set_filename(self, filename):
self.__curfile = filename
def write(self, fp):
options = self.__options
timestamp = time.ctime(time.time())
- # common header
- try:
- sys.stdout = fp
- # The time stamp in the header doesn't have the same format
- # as that generated by xgettext...
- print pot_header % {'time': timestamp, 'version': __version__}
- for k, v in self.__messages.items():
- if not options.writelocations:
- pass
- # location comments are different b/w Solaris and GNU:
- elif options.locationstyle == options.SOLARIS:
- for filename, lineno in v:
- d = {'filename': filename, 'lineno': lineno}
- print _('# File: %(filename)s, line: %(lineno)d') % d
- elif options.locationstyle == options.GNU:
- # fit as many locations on one line, as long as the
- # resulting line length doesn't exceeds 'options.width'
- locline = '#:'
- for filename, lineno in v:
- d = {'filename': filename, 'lineno': lineno}
- s = _(' %(filename)s:%(lineno)d') % d
- if len(locline) + len(s) <= options.width:
- locline = locline + s
- else:
- print locline
- locline = "#:" + s
- if len(locline) > 2:
- print locline
- # TBD: sorting, normalizing
- print 'msgid', normalize(k)
- print 'msgstr ""\n'
- finally:
- sys.stdout = sys.__stdout__
+ # The time stamp in the header doesn't have the same format as that
+ # generated by xgettext...
+ print >> fp, pot_header % {'time': timestamp, 'version': __version__}
+ for k, v in self.__messages.items():
+ if not options.writelocations:
+ pass
+ # location comments are different b/w Solaris and GNU:
+ elif options.locationstyle == options.SOLARIS:
+ for filename, lineno in v:
+ d = {'filename': filename, 'lineno': lineno}
+ print >> fp, _('# File: %(filename)s, line: %(lineno)d') \
+ % d
+ elif options.locationstyle == options.GNU:
+ # fit as many locations on one line, as long as the
+ # resulting line length doesn't exceeds 'options.width'
+ locline = '#:'
+ for filename, lineno in v:
+ d = {'filename': filename, 'lineno': lineno}
+ s = _(' %(filename)s:%(lineno)d') % d
+ if len(locline) + len(s) <= options.width:
+ locline = locline + s
+ else:
+ print >> fp, locline
+ locline = "#:" + s
+ if len(locline) > 2:
+ print >> fp, locline
+ # TBD: sorting, normalizing
+ print >> fp, 'msgid', normalize(k)
+ print >> fp, 'msgstr ""\n'
def main():
@@ -322,11 +355,12 @@
try:
opts, args = getopt.getopt(
sys.argv[1:],
- 'ad:Ehk:Kno:p:S:Vvw:x:',
+ 'ad:DEhk:Kno:p:S:Vvw:x:',
['extract-all', 'default-domain', 'escape', 'help',
'keyword=', 'no-default-keywords',
'add-location', 'no-location', 'output=', 'output-dir=',
'style=', 'verbose', 'version', 'width=', 'exclude-file=',
+ 'docstrings',
])
except getopt.error, msg:
usage(1, msg)
@@ -347,6 +381,7 @@
verbose = 0
width = 78
excludefilename = ''
+ docstrings = 0
options = Options()
locations = {'gnu' : options.GNU,
@@ -363,6 +398,8 @@
options.outfile = arg + '.pot'
elif opt in ('-E', '--escape'):
options.escape = 1
+ elif opt in ('-D', '--docstrings'):
+ options.docstrings = 1
elif opt in ('-k', '--keyword'):
options.keywords.append(arg)
elif opt in ('-K', '--no-default-keywords'):