[pypy-svn] r12814 - pypy/branch/pycompiler/module/recparser

adim at codespeak.net
Thu May 26 19:18:06 CEST 2005


Author: adim
Date: Thu May 26 19:18:06 2005
New Revision: 12814

Modified:
   pypy/branch/pycompiler/module/recparser/ebnflexer.py
   pypy/branch/pycompiler/module/recparser/ebnfparse.py
   pypy/branch/pycompiler/module/recparser/grammar.py
   pypy/branch/pycompiler/module/recparser/pythonlexer.py
   pypy/branch/pycompiler/module/recparser/pythonparse.py
Log:
- added lookahead (don't try to match a rule when the next token
  is not in the rule's first set)
- added small tests
- Still unfinished, and breaks (for now) test_samples.py



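Aside: the lookahead trick in one self-contained snippet. The toy rule
names, token tuples and FIRST sets below are made up for illustration,
they are not the PyPy parser's real tables:

    # prune alternatives whose FIRST set cannot contain the next token
    FIRST = {
        'if_stmt':    {('NAME', 'if')},
        'while_stmt': {('NAME', 'while')},
        'expr_stmt':  {('NUMBER', None), ('NAME', None)},  # None = any value
    }

    def candidate_rules(next_token, rules):
        """keep only the rules whose FIRST set allows next_token"""
        tok_type, tok_value = next_token
        keep = []
        for rule in rules:
            for f_type, f_value in FIRST[rule]:
                if f_type == tok_type and f_value in (None, tok_value):
                    keep.append(rule)
                    break
        return keep

    print(candidate_rules(('NUMBER', '42'),
                          ['if_stmt', 'while_stmt', 'expr_stmt']))
    # -> ['expr_stmt']; the two statement rules are never even tried
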
Modified: pypy/branch/pycompiler/module/recparser/ebnflexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/ebnflexer.py	(original)
+++ pypy/branch/pycompiler/module/recparser/ebnflexer.py	Thu May 26 19:18:06 2005
@@ -21,6 +21,7 @@
         TokenSource.__init__(self)
         self.input = inpstring
         self.pos = 0
+        self._peeked = None
 
     def context(self):
         return self.pos
@@ -36,6 +37,10 @@
         self.pos = ctx
 
     def next(self):
+        if self._peeked is not None:
+            peeked, self._peeked = self._peeked, None  # clear buffer, then return token
+            return peeked
+
         pos = self.pos
         inp = self.input
         m = g_skip.match(inp, pos)
@@ -67,5 +72,11 @@
             return 'SYMBOL',tk
         raise ValueError("Unknown token at pos=%d context='%s'" % (pos,inp[pos:pos+20]) )
 
+    def peek(self):
+##         self._peeked = None
+##         self._peeked = self.next()
+##         return self._peeked
+        return None
+
     def debug(self):
         return self.input[self.pos:self.pos+20]
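
(peek() is still disabled above, it always returns None for now. The
_peeked buffer in next() is the usual one-token pushback pattern; here
is a minimal standalone sketch of it, with a plain list instead of the
real lexer input:)

    class PushbackSource(object):
        """one-token lookahead over an underlying token iterator"""
        def __init__(self, tokens):
            self._tokens = iter(tokens)
            self._peeked = None   # at most one not-yet-consumed token

        def next(self):
            if self._peeked is not None:
                tok, self._peeked = self._peeked, None  # drain the buffer
                return tok
            return next(self._tokens)

        def peek(self):
            if self._peeked is None:
                self._peeked = next(self._tokens)  # fetch and buffer
            return self._peeked

    src = PushbackSource([('NAME', 'x'), ('OP', '=')])
    assert src.peek() == ('NAME', 'x')   # does not consume
    assert src.next() == ('NAME', 'x')   # the same token comes back
    assert src.next() == ('OP', '=')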

Modified: pypy/branch/pycompiler/module/recparser/ebnfparse.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/ebnfparse.py	(original)
+++ pypy/branch/pycompiler/module/recparser/ebnfparse.py	Thu May 26 19:18:06 2005
@@ -32,7 +32,7 @@
         Token.__init__(self, "NAME")
         self.keywords = keywords
 
-    def match(self, source, builder):
+    def match(self, source, builder, level=0):
         """Matches a token.
         the default implementation is to match any token whose type
         corresponds to the object's name. You can extend Token

Modified: pypy/branch/pycompiler/module/recparser/grammar.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/grammar.py	(original)
+++ pypy/branch/pycompiler/module/recparser/grammar.py	Thu May 26 19:18:06 2005
@@ -38,6 +38,22 @@
 
 ######################################################################
 
+
+def build_first_sets(rules):
+    # fixpoint: recompute first sets until a full pass changes nothing
+    changed = True
+    loops = 0
+    while changed:
+        loops += 1
+        changed = False
+        for rule in rules:
+            size = len(rule.first_set)
+            rule.calc_first_set()
+            new_size = len(rule.first_set)
+            if new_size != size:
+                changed = True
+    print "Done", loops, "loops"
+
 from syntaxtree import SyntaxNode, TempSyntaxNode, TokenNode
 
 class BaseGrammarBuilder(object):
@@ -102,6 +118,9 @@
         self.name = name
         self.args = []
         self._is_root = False
+        self.first_set = []
+        self.first_set_complete = False
+        self._processing = False
 
     def is_root(self):
         """This is a root node of the grammar, that is one that will
@@ -110,7 +129,35 @@
             return False
         return True
 
-    def match(self, source, builder):
+    def match(self, source, builder, level=0):
+        """Try to match a grammar rule
+
+        If the next tokens match this grammar element, use <builder>
+        to build an appropriate object; otherwise return None.
+
+        /!\ If the tokens don't match this grammar element, <source>
+        is restored to the state it was in before the call to the
+        match() method
+
+        Returns None if there is no match, or the object built by <builder>
+        """
+        tok_tuple = source.peek()
+        # <tmpdebug>
+        if tok_tuple is not None:
+            if tok_tuple not in self.first_set:
+                prefix = '%s<<<' % ('  ' * level)
+            else:
+                prefix = '%s>>>' % ('  ' * level)
+            print prefix, " TOKEN =", tok_tuple
+            print prefix, " RULE =", self
+            print prefix, " FIRST SET =", self.first_set
+            print "*" * 50
+        # </tmpdebug>
+        if tok_tuple is not None and EmptyToken not in self.first_set and tok_tuple not in self.first_set:
+            return None
+        return self._match(source, builder, level)
+
+    def _match(self, source, builder, level=0):
         """Try to match a grammar rule
 
         If next set of tokens matches this grammar element, use <builder>
@@ -131,13 +178,13 @@
         # To consider if we need to improve speed in parsing
         pass
 
-    def first_set(self):
-        """Returns a list of possible tokens that can start this rule
-        None means the rule can be empty
-        """
-        # **NOT USED** **NOT IMPLEMENTED**
-        # To consider if we need to improve speed in parsing
-        pass
+##     def calc_first_set(self):
+##         """Returns a list of possible tokens that can start this rule
+##         None means the rule can be empty
+##         """
+##         # **NOT USED** **NOT IMPLEMENTED**
+##         # To consider if we need to improve speed in parsing
+##         pass
 
     def __str__(self):
         return self.display(0)
@@ -159,6 +206,15 @@
             print "matched %s (%s): %s" % (self.__class__.__name__, sargs, self.display() )
         return ret
 
+    
+    def calc_first_set(self):
+        """returns the list of possible next tokens
+        *must* be implemented in subclasses
+        """
+        # XXX: first_set could probably be implemented with sets
+        return []
+
+
 class Alternative(GrammarElement):
     """Represents an alternative in a grammar rule (as in S -> A | B | C)"""
     def __init__(self, name, *args):
@@ -167,17 +223,21 @@
         for i in self.args:
             assert isinstance( i, GrammarElement )
 
-    def match(self, source, builder):
+    def _match(self, source, builder, level=0):
         """If any of the rules in self.args matches
         returns the object built from the first rules that matches
         """
         if DEBUG>1:
             print "try alt:", self.display()
+        tok = source.peek()
         # Here we stop at the first match we should
         # try instead to get the longest alternative
         # to see if this solve our problems with infinite recursion
         for rule in self.args:
-            m = rule.match( source, builder )
+            if tok is not None and tok not in rule.first_set and EmptyToken not in rule.first_set:
+                print "Skipping impossible rule: %s" % (rule,)
+                continue
+            m = rule.match(source, builder, level+1)
             if m:
                 ret = builder.alternative( self, source )
                 return self.debug_return( ret )
@@ -192,7 +252,18 @@
             name = ""
         items = [ a.display(1) for a in self.args ]
         return name+"(" + "|".join( items ) + ")"
-        
+
+    def calc_first_set(self):
+        """returns the list of possible next tokens
+        if S -> (A | B | C):
+            LAH(S) = Union( LAH(A), LAH(B), LAH(C) )
+        """
+        # do this to avoid problems on indirect recursive rules
+        for rule in self.args:
+            for t in rule.first_set:
+                if t not in self.first_set:
+                    self.first_set.append(t)
+
 
 class Sequence(GrammarElement):
     """Reprensents a Sequence in a grammar rule (as in S -> A B C)"""
@@ -202,14 +273,20 @@
         for i in self.args:
             assert isinstance( i, GrammarElement )
 
-    def match(self, source, builder):
+    def _match(self, source, builder, level=0):
         """matches all of the symbols in order"""
         if DEBUG>1:
             print "try seq:", self.display()
         ctx = source.context()
         bctx = builder.context()
+        if self.name == 'listmaker':
+            print "----------------------------- LISTMAKER !"
         for rule in self.args:
-            m = rule.match(source, builder)
+            if self.name == 'listmaker':
+                print "    -------------- IN LISTMAKER, rule =", rule
+            m = rule.match(source, builder, level+1)
+            if self.name == 'listmaker' and not m:
+                print "    !!!!!!!!!!!!!! IN LISTMAKER, %s didn't match" % (rule,)
             if not m:
                 # Restore needed because some rules may have been matched
                 # before the one that failed
@@ -229,6 +306,24 @@
         items = [a.display(1) for a in self.args]
         return name + "(" + " ".join( items ) + ")"
 
+    def calc_first_set(self):
+        """returns the list of possible next tokens
+        if S -> A* B C:
+            LAH(S) = Union( LAH(A), LAH(B) )
+        if S -> A+ B C:
+            LAH(S) = LAH(A)
+        if S -> A B C:
+            LAH(S) = LAH(A)
+        """
+        for rule in self.args:
+            # keep aggregating possible first tokens while the prefix can be empty
+            for t in rule.first_set:
+                if t not in self.first_set:
+                    self.first_set.append(t)
+            if EmptyToken not in rule.first_set:
+                break
+
+
 class KleenStar(GrammarElement):
     """Represents a KleenStar in a grammar rule as in (S -> A+) or (S -> A*)"""
     def __init__(self, name, _min = 0, _max = -1, rule=None):
@@ -239,8 +334,10 @@
             raise ValueError("KleenStar needs max==-1 or max>1")
         self.max = _max
         self.star = "x"
+        if self.min == 0:
+            self.first_set.append( EmptyToken )
 
-    def match(self, source, builder):
+    def _match(self, source, builder, level=0):
         """matches a number of times self.args[0]. the number must be comprised
         between self._min and self._max inclusive. -1 is used to represent infinity"""
         if DEBUG>1:
@@ -250,7 +347,7 @@
         rules = 0
         rule = self.args[0]
         while True:
-            m = rule.match(source, builder)
+            m = rule.match(source, builder, level+1)
             if not m:
                 # Rule should be matched at least 'min' times
                 if rules<self.min:
@@ -281,14 +378,27 @@
         s = self.args[0].display(1)
         return name + "%s%s" % (s, star)
 
-            
+
+    def calc_first_set(self):
+        """returns the list of possible next tokens
+        if S -> A*:
+            LAH(S) = Union( LAH(A), EmptyToken )
+        if S -> A+:
+            LAH(S) = LAH(A)
+        """
+        rule = self.args[0]
+        self.first_set = rule.first_set[:]
+        if self.min == 0 and EmptyToken not in self.first_set:
+            self.first_set.append(EmptyToken)
+
 class Token(GrammarElement):
     """Represents a Token in a grammar rule (a lexer token)"""
     def __init__( self, name, value = None):
         GrammarElement.__init__( self, name )
         self.value = value
+        self.first_set = [self]
 
-    def match(self, source, builder):
+    def _match(self, source, builder, level=0):
         """Matches a token.
         the default implementation is to match any token whose type
         corresponds to the object's name. You can extend Token
@@ -301,7 +411,7 @@
         """
         ctx = source.context()
         tk_type, tk_value = source.next()
-        if tk_type==self.name:
+        if tk_type == self.name:
             if self.value is None:
                 ret = builder.token( tk_type, tk_value, source )
                 return self.debug_return( ret, tk_type )
@@ -320,3 +430,28 @@
             return "<%s>=='%s'" % (self.name, self.value)
     
 
+    def __eq__(self, other):
+        """convenience '==' implementation, this is *not* a *real* equality test
+        a Token instance can be compared to:
+         - another Token instance in which case all fields (name and value)
+           must be equal
+         - a tuple, such as those yielded by the Python lexer, in which case
+           the comparison algorithm is similar to the one in match()
+           XXX refactor match and __eq__ ?
+        """
+        if other is None:
+            return False
+        elif isinstance(other, Token):
+            return self.value == other.value and self.name == other.name
+        elif isinstance(other, tuple) and len(other) == 2:
+            tk_type, tk_value = other
+            return tk_type == self.name and self.value in (None, tk_value)
+        return False
+
+    
+    def calc_first_set(self):
+        """returns the list of possible next tokens
+        """
+        pass
+
+EmptyToken = object() # sentinel for rules that can match nothing (could become a Token)
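
The build_first_sets() helper added above is a classic fixpoint
computation. Condensed sketch of the same idea on a toy grammar (the
GRAMMAR/EMPTY/first_of_alt names are invented for this example; the
loop mirrors build_first_sets and Sequence.calc_first_set):

    # toy grammar:  S -> A B ;  A -> 'a' A | <empty> ;  B -> 'b'
    GRAMMAR = {
        'S': [['A', 'B']],
        'A': [['a', 'A'], []],   # [] is the empty alternative
        'B': [['b']],
    }
    EMPTY = '<empty>'            # plays the role of EmptyToken
    first = dict((name, set()) for name in GRAMMAR)

    def first_of_alt(alt):
        """first set of one alternative (a sequence of symbols)"""
        result = set()
        for sym in alt:
            sym_first = first[sym] if sym in GRAMMAR else set([sym])
            result |= sym_first - set([EMPTY])
            if EMPTY not in sym_first:
                return result    # symbol can't be empty: sequence stops here
        result.add(EMPTY)        # empty alt, or all symbols can be empty
        return result

    changed = True
    while changed:               # iterate until no first set grows
        changed = False
        for name, alternatives in GRAMMAR.items():
            size = len(first[name])
            for alt in alternatives:
                first[name] |= first_of_alt(alt)
            if len(first[name]) != size:
                changed = True

    print(sorted(first.items()))
    # A: 'a' or empty;  B: 'b';  S: 'a' or 'b'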

Modified: pypy/branch/pycompiler/module/recparser/pythonlexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonlexer.py	(original)
+++ pypy/branch/pycompiler/module/recparser/pythonlexer.py	Thu May 26 19:18:06 2005
@@ -223,6 +223,14 @@
         if DEBUG:
             print "%d/%d: %s, %s" % (self.stack_pos, len(self.stack), tok, val)
         return (tok, val)
+
+    def peek(self):
+        """returns next token without consuming it"""
+        ctx = self.context()
+        tok_tuple = self.next()
+        self.restore(ctx)
+        return tok_tuple
+        
             
     def end_of_file(self):
         """return DEDENT and ENDMARKER"""

Modified: pypy/branch/pycompiler/module/recparser/pythonparse.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonparse.py	(original)
+++ pypy/branch/pycompiler/module/recparser/pythonparse.py	Thu May 26 19:18:06 2005
@@ -16,6 +16,8 @@
     grammar.DEBUG = 0
     gram = parse_grammar( file(PYTHON_GRAMMAR) )
     grammar.DEBUG = level
+    # Build first sets for each rule (including anonymous ones)
+    grammar.build_first_sets(gram.items)
     return gram
 
 PYTHON_PARSER = python_grammar()
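
Since PYTHON_PARSER is created at import time, the fixpoint iteration
runs once per process, not once per parse. Untested snippet to eyeball
the computed sets (assuming gram.items and rule.first_set as above):

    from pythonparse import python_grammar

    gram = python_grammar()
    for rule in gram.items:
        # every rule now carries its precomputed lookahead set
        print("%s: %d first-set entries" % (rule.name, len(rule.first_set)))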


