[Python-Dev] Python syntax checker ?

Jeremy Hylton jeremy@beopen.com
Mon, 25 Sep 2000 10:50:30 -0400 (EDT)


>>>>> "JCA" == James C Ahlstrom <jim@interet.com> writes:

  JCA> The following is based on trying (a great learning experience)
  JCA> to write a better Python lint.

  JCA> There are IMHO two problems with the current Python grammar
  JCA> file.  It is not possible to express operator precedence, so
  JCA> deliberate shift/reduce conflicts are used instead.  That makes
  JCA> the parse tree complicated and non intuitive.  And there is no
  JCA> provision for error productions.  YACC has both of these as
  JCA> built-in features.

  JCA> I also found speed problems with tokenize.py.  AFAIK, it only
  JCA> exists because tokenizer.c does not provide comments as tokens,
  JCA> but eats them instead.  We could modify tokenizer.c, then make
  JCA> tokenize.py be the interface to the fast C tokenizer.  This
  JCA> eliminates the problem of updating both too.

  JCA> So how about re-writing the Python grammar in YACC in order to
  JCA> use its more advanced features??  The simple YACC grammar I
  JCA> wrote for 1.5.2 plus an altered tokenizer.c parsed the whole
  JCA> Lib/*.py in a couple seconds vs. 30 seconds for the first file
  JCA> using Aaron Watters' Python lint grammar written in Python.

Why not use the actual Python parser instead of tokenize.py?  I assume
it is also faster than Aaron's Python lint grammer written in Python.
The compiler in Tools/compiler uses the parser module internally and
produces an AST that is straightforward to use.  (The parse tree
produced by the parser module is fairly low-level.)

There was a thread (on the compiler-sig, I believe) where Moshe and I
noodled with a simple lint-like warnings framework based on the
compiler package.  I don't have the code we ended up with, but I found
an example checker in the mail archives and have included it below.
It checks for NameErrors.

I believe one useful change that Moshe and I arrived at was to avoid
the explicit stack that the code uses (via enterNamespace and
exitNamespace) and instead pass the namespace as an optional extra
argument to the visitXXX methods.

Jeremy

"""Check for NameErrors"""

from compiler import parseFile, walk
from compiler.misc import Stack, Set

import __builtin__
from UserDict import UserDict

class Warning:
    def __init__(self, filename, funcname, lineno):
        self.filename = filename
        self.funcname = funcname
        self.lineno = lineno

    def __str__(self):
        return self._template % self.__dict__

class UndefinedLocal(Warning):
    super_init = Warning.__init__
    
    def __init__(self, filename, funcname, lineno, name):
        self.super_init(filename, funcname, lineno)
        self.name = name

    _template = "%(filename)s:%(lineno)s  " \
                "%(funcname)s undefined local %(name)s"

class NameError(UndefinedLocal):
    _template = "%(filename)s:%(lineno)s  " \
                "%(funcname)s undefined name %(name)s"

class NameSet(UserDict):
    """Track names and the line numbers where they are referenced"""
    def __init__(self):
        self.data = self.names = {}

    def add(self, name, lineno):
        l = self.names.get(name, [])
        l.append(lineno)
        self.names[name] = l

class CheckNames:
    def __init__(self, filename):
        self.filename = filename
        self.warnings = []
        self.scope = Stack()
        self.gUse = NameSet()
        self.gDef = NameSet()
        # _locals is the stack of local namespaces
        # locals is the top of the stack
        self._locals = Stack()
        self.lUse = None
        self.lDef = None
        self.lGlobals = None # var declared global
        # holds scope,def,use,global triples for later analysis
        self.todo = []

    def enterNamespace(self, node):
        self.scope.push(node)
        self.lUse = use = NameSet()
        self.lDef = _def = NameSet()
        self.lGlobals = gbl = NameSet()
        self._locals.push((use, _def, gbl))

    def exitNamespace(self):
        self.todo.append((self.scope.top(), self.lDef, self.lUse,
                          self.lGlobals))
        self.scope.pop()
        self._locals.pop()
        if self._locals:
            self.lUse, self.lDef, self.lGlobals = self._locals.top()
        else:
            self.lUse = self.lDef = self.lGlobals = None

    def warn(self, warning, funcname, lineno, *args):
        args = (self.filename, funcname, lineno) + args
        self.warnings.append(apply(warning, args))

    def defName(self, name, lineno, local=1):
        if self.lUse is None:
            self.gDef.add(name, lineno)
        elif local == 0:
            self.gDef.add(name, lineno)
            self.lGlobals.add(name, lineno)
        else:
            self.lDef.add(name, lineno)

    def useName(self, name, lineno, local=1):
        if self.lUse is None:
            self.gUse.add(name, lineno)
        elif local == 0:
            self.gUse.add(name, lineno)
            self.lUse.add(name, lineno)            
        else:
            self.lUse.add(name, lineno)

    def check(self):
        for s, d, u, g in self.todo:
            self._check(s, d, u, g, self.gDef)
        # XXX then check the globals

    def _check(self, scope, _def, use, gbl, globals):
        # check for NameError
        # a name is defined iff it is in def.keys()
        # a name is global iff it is in gdefs.keys()
        gdefs = UserDict()
        gdefs.update(globals)
        gdefs.update(__builtin__.__dict__)
        defs = UserDict()
        defs.update(gdefs)
        defs.update(_def)
        errors = Set()
        for name in use.keys():
            if not defs.has_key(name):
                firstuse = use[name][0]
                self.warn(NameError, scope.name, firstuse, name)
                errors.add(name)

        # check for UndefinedLocalNameError
        # order == use & def sorted by lineno
        # elements are lineno, flag, name
        # flag = 0 if use, flag = 1 if def
        order = []
        for name, lines in use.items():
            if gdefs.has_key(name) and not _def.has_key(name):
                # this is a global ref, we can skip it
                continue
            for lineno in lines:
                order.append((lineno, 0, name))
        for name, lines in _def.items():
            for lineno in lines:
                order.append((lineno, 1, name))
        order.sort()
        # ready contains names that have been defined or warned about
        ready = Set()
        for lineno, flag, name in order:
            if flag == 0: # use
                if not ready.has_elt(name) and not errors.has_elt(name):
                    self.warn(UndefinedLocal, scope.name, lineno, name)
                    ready.add(name) # don't warn again
            else:
                ready.add(name)

    # below are visitor methods

    def visitFunction(self, node, noname=0):
        for expr in node.defaults:
            self.visit(expr)
        if not noname:
            self.defName(node.name, node.lineno)
        self.enterNamespace(node)
        for name in node.argnames:
            self.defName(name, node.lineno)
        self.visit(node.code)
        self.exitNamespace()
        return 1

    def visitLambda(self, node):
        return self.visitFunction(node, noname=1)

    def visitClass(self, node):
        for expr in node.bases:
            self.visit(expr)
        self.defName(node.name, node.lineno)
        self.enterNamespace(node)
        self.visit(node.code)
        self.exitNamespace()
        return 1

    def visitName(self, node):
        self.useName(node.name, node.lineno)

    def visitGlobal(self, node):
        for name in node.names:
            self.defName(name, node.lineno, local=0)

    def visitImport(self, node):
        for name, alias in node.names:
            self.defName(alias or name, node.lineno)

    visitFrom = visitImport

    def visitAssName(self, node):
        self.defName(node.name, node.lineno)
    
def check(filename):
    global p, checker
    p = parseFile(filename)
    checker = CheckNames(filename)
    walk(p, checker)
    checker.check()
    for w in checker.warnings:
        print w

if __name__ == "__main__":
    import sys

    # XXX need to do real arg processing
    check(sys.argv[1])