[Python-checkins] python/dist/src/Lib pyclbr.py,1.26,1.27

Thu, 22 Aug 2002 18:36:03 -0700

Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv15469

Modified Files:
	pyclbr.py 
Log Message:
Rewritten using the tokenize module, which gives us a real tokenizer
rather than a number of approximating regular expressions.
Alas, it is 3-4 times slower.  Let that be a challenge for the
tokenize module.

Index: pyclbr.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/pyclbr.py,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** pyclbr.py	3 Jun 2002 15:58:31 -0000	1.26
--- pyclbr.py	23 Aug 2002 01:36:01 -0000	1.27
***************
*** 5,12 ****

  The interface consists of a single function:
!         readmodule(module, path)
  module is the name of a Python module, path is an optional list of
  directories where the module is to be searched.  If present, path is
! prepended to the system search path sys.path.
  The return value is a dictionary.  The keys of the dictionary are
  the names of the classes defined in the module (including classes
--- 5,13 ----

  The interface consists of a single function:
!         readmodule_ex(module [, path[, inpackage]])
  module is the name of a Python module, path is an optional list of
  directories where the module is to be searched.  If present, path is
! prepended to the system search path sys.path.  (inpackage is used
! internally to search for a submodule of a package.)
  The return value is a dictionary.  The keys of the dictionary are
  the names of the classes defined in the module (including classes
***************
*** 29,38 ****
  shouldn't happen often.

  BUGS
- - Continuation lines are not dealt with at all, except inside strings.
  - Nested classes and functions can confuse it.
- - Code that doesn't pass tabnanny or python -t will confuse it, unless
-   you set the module TABWIDTH vrbl (default 8) to the correct tab width
-   for the file.

  PACKAGE RELATED BUGS
--- 30,37 ----
  shouldn't happen often.

+ XXX describe the Function class.
+ 
  BUGS
  - Nested classes and functions can confuse it.

  PACKAGE RELATED BUGS
***************
*** 53,119 ****
  import sys
  import imp
! import re
! import string

  __all__ = ["readmodule"]

- TABWIDTH = 8
- 
- _getnext = re.compile(r"""
-     (?P<String>
-        \""" [^"\\]* (?:
-                         (?: \\. | "(?!"") )
-                         [^"\\]*
-                     )*
-        \"""
- 
-     |   ''' [^'\\]* (?:
-                         (?: \\. | '(?!'') )
-                         [^'\\]*
-                     )*
-         '''
- 
-     |   " [^"\\\n]* (?: \\. [^"\\\n]*)* "
- 
-     |   ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
-     )
- 
- |   (?P<Method>
-         ^
-         (?P<MethodIndent> [ \t]* )
-         def [ \t]+
-         (?P<MethodName> [a-zA-Z_] \w* )
-         [ \t]* \(
-     )
- 
- |   (?P<Class>
-         ^
-         (?P<ClassIndent> [ \t]* )
-         class [ \t]+
-         (?P<ClassName> [a-zA-Z_] \w* )
-         [ \t]*
-         (?P<ClassSupers> \( [^)\n]* \) )?
-         [ \t]* :
-     )
- 
- |   (?P<Import>
-         ^ import [ \t]+
-         (?P<ImportList> [^#;\n]+ )
-     )
- 
- |   (?P<ImportFrom>
-         ^ from [ \t]+
-         (?P<ImportFromPath>
-             [a-zA-Z_] \w*
-             (?:
-                 [ \t]* \. [ \t]* [a-zA-Z_] \w*
-             )*
-         )
-         [ \t]+
-         import [ \t]+
-         (?P<ImportFromList> [^#;\n]+ )
-     )
- """, re.VERBOSE | re.DOTALL | re.MULTILINE).search
- 
  _modules = {}                           # cache of modules we've seen

--- 52,60 ----
  import sys
  import imp
! import tokenize # Python tokenizer
! from token import NAME

  __all__ = ["readmodule"]

  _modules = {}                           # cache of modules we've seen

***************
*** 141,145 ****
          assert 0, "Function._addmethod() shouldn't be called"

! def readmodule(module, path=[], inpackage=0):
      '''Backwards compatible interface.

--- 82,86 ----
          assert 0, "Function._addmethod() shouldn't be called"

! def readmodule(module, path=[], inpackage=False):
      '''Backwards compatible interface.

***************
*** 154,158 ****
      return res

! def readmodule_ex(module, path=[], inpackage=0):
      '''Read a module file and return a dictionary of classes.

--- 95,99 ----
      return res

! def readmodule_ex(module, path=[], inpackage=False):
      '''Read a module file and return a dictionary of classes.

***************
*** 169,173 ****
          submodule = module[i+1:].strip()
          parent = readmodule_ex(package, path, inpackage)
!         child = readmodule_ex(submodule, parent['__path__'], 1)
          return child

--- 110,114 ----
          submodule = module[i+1:].strip()
          parent = readmodule_ex(package, path, inpackage)
!         child = readmodule_ex(submodule, parent['__path__'], True)
          return child

***************
*** 205,332 ****
      _modules[module] = dict
      classstack = [] # stack of (class, indent) pairs
-     src = f.read()
-     f.close()
- 
-     # To avoid having to stop the regexp at each newline, instead
-     # when we need a line number we simply count the number of
-     # newlines in the string since the last time we did this; i.e.,
-     #    lineno += src.count('\n', last_lineno_pos, here)
-     #    last_lineno_pos = here
-     lineno, last_lineno_pos = 1, 0
-     i = 0
-     while 1:
-         m = _getnext(src, i)
-         if not m:
-             break
-         start, i = m.span()
- 
-         if m.start("Method") >= 0:
-             # found a method definition or function
-             thisindent = _indent(m.group("MethodIndent"))
-             meth_name = m.group("MethodName")
-             lineno += src.count('\n', last_lineno_pos, start)
-             last_lineno_pos = start
-             # close all classes indented at least as much
-             while classstack and \
-                   classstack[-1][1] >= thisindent:
-                 del classstack[-1]
-             if classstack:
-                 # it's a class method
-                 cur_class = classstack[-1][0]
-                 cur_class._addmethod(meth_name, lineno)
-             else:
-                 # it's a function
-                 f = Function(module, meth_name,
-                              file, lineno)
-                 dict[meth_name] = f

!         elif m.start("String") >= 0:
!             pass
! 
!         elif m.start("Class") >= 0:
!             # we found a class definition
!             thisindent = _indent(m.group("ClassIndent"))
!             # close all classes indented at least as much
!             while classstack and \
!                   classstack[-1][1] >= thisindent:
!                 del classstack[-1]
!             lineno += src.count('\n', last_lineno_pos, start)
!             last_lineno_pos = start
!             class_name = m.group("ClassName")
!             inherit = m.group("ClassSupers")
!             if inherit:
!                 # the class inherits from other classes
!                 inherit = inherit[1:-1].strip()
!                 names = []
!                 for n in inherit.split(','):
!                     n = n.strip()
!                     if n in dict:
!                         # we know this super class
!                         n = dict[n]
!                     else:
!                         c = n.split('.')
!                         if len(c) > 1:
!                             # super class
!                             # is of the
!                             # form module.class:
!                             # look in
!                             # module for class
!                             m = c[-2]
!                             c = c[-1]
!                             if m in _modules:
!                                 d = _modules[m]
!                                 if c in d:
!                                     n = d[c]
!                     names.append(n)
!                 inherit = names
!             # remember this class
!             cur_class = Class(module, class_name, inherit,
!                               file, lineno)
!             dict[class_name] = cur_class
!             classstack.append((cur_class, thisindent))
! 
!         elif m.start("Import") >= 0:
!             # import module
!             for n in m.group("ImportList").split(','):
!                 n = n.strip()
                  try:
                      # recursively read the imported module
!                     d = readmodule_ex(n, path, inpackage)
                  except:
!                     ##print 'module', n, 'not found'
!                     pass
! 
!         elif m.start("ImportFrom") >= 0:
!             # from module import stuff
!             mod = m.group("ImportFromPath")
!             names = m.group("ImportFromList").split(',')
!             try:
!                 # recursively read the imported module
!                 d = readmodule_ex(mod, path, inpackage)
!             except:
!                 ##print 'module', mod, 'not found'
!                 continue
!             # add any classes that were defined in the
!             # imported module to our name space if they
!             # were mentioned in the list
!             for n in names:
!                 n = n.strip()
!                 if n in d:
!                     dict[n] = d[n]
!                 elif n == '*':
!                     # only add a name if not
!                     # already there (to mimic what
!                     # Python does internally)
!                     # also don't add names that
!                     # start with _
!                     for n in d:
!                         if n[0] != '_' and \
!                            not n in dict:
!                             dict[n] = d[n]
!         else:
!             assert 0, "regexp _getnext found something unexpected"

      return dict

! def _indent(ws, _expandtabs=string.expandtabs):
!     return len(_expandtabs(ws, TABWIDTH))
--- 146,288 ----
      _modules[module] = dict
      classstack = [] # stack of (class, indent) pairs

!     g = tokenize.generate_tokens(f.readline)
!     try:
!         for tokentype, token, start, end, line in g:
!             if token == 'def':
!                 lineno, thisindent = start
!                 tokentype, meth_name, start, end, line = g.next()
!                 if tokentype != NAME:
!                     continue # Syntax error
!                 # close all classes indented at least as much
!                 while classstack and \
!                       classstack[-1][1] >= thisindent:
!                     del classstack[-1]
!                 if classstack:
!                     # it's a class method
!                     cur_class = classstack[-1][0]
!                     cur_class._addmethod(meth_name, lineno)
!                 else:
!                     # it's a function
!                     dict[meth_name] = Function(module, meth_name, file, lineno)
!             elif token == 'class':
!                 lineno, thisindent = start
!                 tokentype, class_name, start, end, line = g.next()
!                 if tokentype != NAME:
!                     continue # Syntax error
!                 # close all classes indented at least as much
!                 while classstack and \
!                       classstack[-1][1] >= thisindent:
!                     del classstack[-1]
!                 # parse what follows the class name
!                 tokentype, token, start, end, line = g.next()
!                 inherit = None
!                 if token == '(':
!                     names = [] # List of superclasses
!                     # there's a list of superclasses
!                     level = 1
!                     super = [] # Tokens making up current superclass
!                     while True:
!                         tokentype, token, start, end, line = g.next()
!                         if token in (')', ',') and level == 1:
!                             n = "".join(super)
!                             if n in dict:
!                                 # we know this super class
!                                 n = dict[n]
!                             else:
!                                 c = n.split('.')
!                                 if len(c) > 1:
!                                     # super class is of the form
!                                     # module.class: look in module for
!                                     # class
!                                     m = c[-2]
!                                     c = c[-1]
!                                     if m in _modules:
!                                         d = _modules[m]
!                                         if c in d:
!                                             n = d[c]
!                             names.append(n)
!                         if token == '(':
!                             level += 1
!                         elif token == ')':
!                             level -= 1
!                             if level == 0:
!                                 break
!                         elif token == ',' and level == 1:
!                             pass
!                         else:
!                             super.append(token)
!                     inherit = names
!                 cur_class = Class(module, class_name, inherit, file, lineno)
!                 dict[class_name] = cur_class
!                 classstack.append((cur_class, thisindent))
!             elif token == 'import' and start[1] == 0:
!                 modules = _getnamelist(g)
!                 for mod, mod2 in modules:
!                     readmodule_ex(mod, path, inpackage)
!             elif token == 'from' and start[1] == 0:
!                 mod, token = _getname(g)
!                 if not mod or token != "import":
!                     continue
!                 names = _getnamelist(g)
                  try:
                      # recursively read the imported module
!                     d = readmodule_ex(mod, path, inpackage)
                  except:
!                     continue
!                 # add any classes that were defined in the imported module
!                 # to our name space if they were mentioned in the list
!                 for n, n2 in names:
!                     if n in d:
!                         dict[n2 or n] = d[n]
!                     elif n == '*':
!                         # only add a name if not already there (to mimic
!                         # what Python does internally) also don't add
!                         # names that start with _
!                         for n in d:
!                             if n[0] != '_' and not n in dict:
!                                 dict[n] = d[n]
!     except StopIteration:
!         pass

+     f.close()
      return dict

! def _getnamelist(g):
!     # Helper to get a comma-separated list of dotted names plus 'as'
!     # clauses.  Return a list of pairs (name, name2) where name2 is
!     # the 'as' name, or None if there is no 'as' clause.
!     names = []
!     while True:
!         name, token = _getname(g)
!         if not name:
!             break
!         if token == 'as':
!             name2, token = _getname(g)
!         else:
!             name2 = None
!         names.append((name, name2))
!         while token != "," and "\n" not in token:
!             tokentype, token, start, end, line = g.next()
!         if token != ",":
!             break
!     return names
! 
! def _getname(g):
!     # Helper to get a dotted name, return a pair (name, token) where
!     # name is the dotted name, or None if there was no dotted name,
!     # and token is the next input token.
!     parts = []
!     tokentype, token, start, end, line = g.next()
!     if tokentype != NAME and token != '*':
!         return (None, token)
!     parts.append(token)
!     while True:
!         tokentype, token, start, end, line = g.next()
!         if token != '.':
!             break
!         tokentype, token, start, end, line = g.next()
!         if tokentype != NAME:
!             break
!         parts.append(token)
!     return (".".join(parts), token)