[pypy-svn] r12814 - pypy/branch/pycompiler/module/recparser
adim at codespeak.net
adim at codespeak.net
Thu May 26 19:18:06 CEST 2005
Author: adim
Date: Thu May 26 19:18:06 2005
New Revision: 12814
Modified:
pypy/branch/pycompiler/module/recparser/ebnflexer.py
pypy/branch/pycompiler/module/recparser/ebnfparse.py
pypy/branch/pycompiler/module/recparser/grammar.py
pypy/branch/pycompiler/module/recparser/pythonlexer.py
pypy/branch/pycompiler/module/recparser/pythonparse.py
Log:
- added lookahead (don't try to match a rule when the next token
is not in the rule's first set)
- added small tests
- Still unfinished, and breaks (for now) test_samples.py
Modified: pypy/branch/pycompiler/module/recparser/ebnflexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/ebnflexer.py (original)
+++ pypy/branch/pycompiler/module/recparser/ebnflexer.py Thu May 26 19:18:06 2005
@@ -21,6 +21,7 @@
TokenSource.__init__(self)
self.input = inpstring
self.pos = 0
+ self._peeked = None
def context(self):
return self.pos
@@ -36,6 +37,10 @@
self.pos = ctx
def next(self):
+ if self._peeked is not None:
+ self._peeked = None
+ return self._peeked
+
pos = self.pos
inp = self.input
m = g_skip.match(inp, pos)
@@ -67,5 +72,11 @@
return 'SYMBOL',tk
raise ValueError("Unknown token at pos=%d context='%s'" % (pos,inp[pos:pos+20]) )
+ def peek(self):
+## self._peeked = None
+## self._peeked = self.next()
+## return self._peeked
+ return None
+
def debug(self):
return self.input[self.pos:self.pos+20]
Modified: pypy/branch/pycompiler/module/recparser/ebnfparse.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/ebnfparse.py (original)
+++ pypy/branch/pycompiler/module/recparser/ebnfparse.py Thu May 26 19:18:06 2005
@@ -32,7 +32,7 @@
Token.__init__(self, "NAME")
self.keywords = keywords
- def match(self, source, builder):
+ def match(self, source, builder, level=0):
"""Matches a token.
the default implementation is to match any token whose type
corresponds to the object's name. You can extend Token
Modified: pypy/branch/pycompiler/module/recparser/grammar.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/grammar.py (original)
+++ pypy/branch/pycompiler/module/recparser/grammar.py Thu May 26 19:18:06 2005
@@ -38,6 +38,22 @@
######################################################################
+
+def build_first_sets(rules):
+ # PSEUDO CODE
+ changed = True
+ loops = 0
+ while changed:
+ loops += 1
+ changed = False
+ for rule in rules:
+ size = len(rule.first_set)
+ rule.calc_first_set()
+ new_size = len(rule.first_set)
+ if new_size != size:
+ changed = True
+ print "Done", loops, "loops"
+
from syntaxtree import SyntaxNode, TempSyntaxNode, TokenNode
class BaseGrammarBuilder(object):
@@ -102,6 +118,9 @@
self.name = name
self.args = []
self._is_root = False
+ self.first_set = []
+ self.first_set_complete = False
+ self._processing = False
def is_root(self):
"""This is a root node of the grammar, that is one that will
@@ -110,7 +129,35 @@
return False
return True
- def match(self, source, builder):
+ def match(self, source, builder, level=0):
+ """Try to match a grammar rule
+
+ If next set of tokens matches this grammar element, use <builder>
+ to build an appropriate object, otherwise returns None.
+
+ /!\ If the sets of element didn't match the current grammar
+ element, then the <source> is restored as it was before the
+ call to the match() method
+
+ returns None if no match or an object build by builder
+ """
+ tok_tuple = source.peek()
+ # <tmpdebug>
+ if tok_tuple is not None:
+ if tok_tuple not in self.first_set:
+ prefix = '%s<<<' % (' ' * level)
+ else:
+ prefix = '%s>>>' % (' ' * level)
+ print prefix, " TOKEN =", tok_tuple
+ print prefix, " RULE =", self
+ print prefix, " FIRST SET =", self.first_set
+ print "*" * 50
+ # </tmpdebug>
+ if tok_tuple is not None and EmptyToken not in self.first_set and tok_tuple not in self.first_set:
+ return None
+ return self._match(source, builder, level)
+
+ def _match(self, source, builder, level=0):
"""Try to match a grammar rule
If next set of tokens matches this grammar element, use <builder>
@@ -131,13 +178,13 @@
# To consider if we need to improve speed in parsing
pass
- def first_set(self):
- """Returns a list of possible tokens that can start this rule
- None means the rule can be empty
- """
- # **NOT USED** **NOT IMPLEMENTED**
- # To consider if we need to improve speed in parsing
- pass
+## def calc_first_set(self):
+## """Returns a list of possible tokens that can start this rule
+## None means the rule can be empty
+## """
+## # **NOT USED** **NOT IMPLEMENTED**
+## # To consider if we need to improve speed in parsing
+## pass
def __str__(self):
return self.display(0)
@@ -159,6 +206,15 @@
print "matched %s (%s): %s" % (self.__class__.__name__, sargs, self.display() )
return ret
+
+ def calc_first_set(self):
+ """returns the list of possible next tokens
+ *must* be implemented in subclasses
+ """
+ # XXX: first_set could probably be implemented with sets
+ return []
+
+
class Alternative(GrammarElement):
"""Represents an alternative in a grammar rule (as in S -> A | B | C)"""
def __init__(self, name, *args):
@@ -167,17 +223,21 @@
for i in self.args:
assert isinstance( i, GrammarElement )
- def match(self, source, builder):
+ def _match(self, source, builder, level=0):
"""If any of the rules in self.args matches
returns the object built from the first rules that matches
"""
if DEBUG>1:
print "try alt:", self.display()
+ tok = source.peek()
# Here we stop at the first match we should
# try instead to get the longest alternative
# to see if this solve our problems with infinite recursion
for rule in self.args:
- m = rule.match( source, builder )
+ if tok is not None and tok not in rule.first_set:
+ print "Skipping impossible rule: %s" % (rule,)
+ continue
+ m = rule.match(source, builder, level+1)
if m:
ret = builder.alternative( self, source )
return self.debug_return( ret )
@@ -192,7 +252,18 @@
name = ""
items = [ a.display(1) for a in self.args ]
return name+"(" + "|".join( items ) + ")"
-
+
+ def calc_first_set(self):
+ """returns the list of possible next tokens
+ if S -> (A | B | C):
+ LAH(S) = Union( LAH(A), LAH(B), LAH(C) )
+ """
+ # do this to avoid problems on indirect recursive rules
+ for rule in self.args:
+ for t in rule.first_set:
+ if t not in self.first_set:
+ self.first_set.append(t)
+
class Sequence(GrammarElement):
"""Reprensents a Sequence in a grammar rule (as in S -> A B C)"""
@@ -202,14 +273,20 @@
for i in self.args:
assert isinstance( i, GrammarElement )
- def match(self, source, builder):
+ def _match(self, source, builder, level=0):
"""matches all of the symbols in order"""
if DEBUG>1:
print "try seq:", self.display()
ctx = source.context()
bctx = builder.context()
+ if self.name == 'listmaker':
+ print "----------------------------- LISTMAKER !"
for rule in self.args:
- m = rule.match(source, builder)
+ if self.name == 'listmaker':
+ print " -------------- IN LISTMAKER, rule =", rule
+ m = rule.match(source, builder, level+1)
+ if self.name == 'listmaker':
+ print " !!!!!!!!!!!!!! IN LISTMAKER, doesn't match %s" % (rule,)
if not m:
# Restore needed because some rules may have been matched
# before the one that failed
@@ -229,6 +306,24 @@
items = [a.display(1) for a in self.args]
return name + "(" + " ".join( items ) + ")"
+ def calc_first_set(self):
+ """returns the list of possible next tokens
+ if S -> A* B C:
+ LAH(S) = Union( LAH(A), LAH(B) )
+ if S -> A+ B C:
+ LAH(S) = LAH(A)
+ if S -> A B C:
+ LAH(S) = LAH(A)
+ """
+ for rule in self.args:
+ # while we're in this loop, keep agregating possible tokens
+ for t in rule.first_set:
+ if t not in self.first_set:
+ self.first_set.append(t)
+ if EmptyToken not in rule.first_set:
+ break
+
+
class KleenStar(GrammarElement):
"""Represents a KleenStar in a grammar rule as in (S -> A+) or (S -> A*)"""
def __init__(self, name, _min = 0, _max = -1, rule=None):
@@ -239,8 +334,10 @@
raise ValueError("KleenStar needs max==-1 or max>1")
self.max = _max
self.star = "x"
+ if self.min == 0:
+ self.first_set.append( EmptyToken )
- def match(self, source, builder):
+ def _match(self, source, builder, level=0):
"""matches a number of times self.args[0]. the number must be comprised
between self._min and self._max inclusive. -1 is used to represent infinity"""
if DEBUG>1:
@@ -250,7 +347,7 @@
rules = 0
rule = self.args[0]
while True:
- m = rule.match(source, builder)
+ m = rule.match(source, builder, level+1)
if not m:
# Rule should be matched at least 'min' times
if rules<self.min:
@@ -281,14 +378,27 @@
s = self.args[0].display(1)
return name + "%s%s" % (s, star)
-
+
+ def calc_first_set(self):
+ """returns the list of possible next tokens
+ if S -> A*:
+ LAH(S) = Union( LAH(A), EmptyToken )
+ if S -> A+:
+ LAH(S) = LAH(A)
+ """
+ rule = self.args[0]
+ self.first_set = rule.first_set[:]
+ if self.min == 0 and EmptyToken not in self.first_set:
+ self.first_set.append(EmptyToken)
+
class Token(GrammarElement):
"""Represents a Token in a grammar rule (a lexer token)"""
def __init__( self, name, value = None):
GrammarElement.__init__( self, name )
self.value = value
+ self.first_set = [self]
- def match(self, source, builder):
+ def _match(self, source, builder, level=0):
"""Matches a token.
the default implementation is to match any token whose type
corresponds to the object's name. You can extend Token
@@ -301,7 +411,7 @@
"""
ctx = source.context()
tk_type, tk_value = source.next()
- if tk_type==self.name:
+ if tk_type == self.name:
if self.value is None:
ret = builder.token( tk_type, tk_value, source )
return self.debug_return( ret, tk_type )
@@ -320,3 +430,28 @@
return "<%s>=='%s'" % (self.name, self.value)
+ def __eq__(self, other):
+ """convenience '==' implementation, this is *not* a *real* equality test
+ a Token instance can be compared to:
+ - another Token instance in which case all fields (name and value)
+ must be equal
+ - a tuple, such as those yielded by the Python lexer, in which case
+ the comparison algorithm is similar to the one in match()
+ XXX refactor match and __eq__ ?
+ """
+ if other is None:
+ return False
+ elif isinstance(other, Token):
+ return self.value == other.value and self.name == other.name
+ elif isinstance(other, tuple) and len(other) == 2:
+ tk_type, tk_value = other
+ return tk_type == self.name and self.value in (None, tk_value)
+ return False
+
+
+ def calc_first_set(self):
+ """returns the list of possible next tokens
+ """
+ pass
+
+EmptyToken = object() # Token('???')
Modified: pypy/branch/pycompiler/module/recparser/pythonlexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonlexer.py (original)
+++ pypy/branch/pycompiler/module/recparser/pythonlexer.py Thu May 26 19:18:06 2005
@@ -223,6 +223,14 @@
if DEBUG:
print "%d/%d: %s, %s" % (self.stack_pos, len(self.stack), tok, val)
return (tok, val)
+
+ def peek(self):
+ """returns next token without consuming it"""
+ ctx = self.context()
+ tok_tuple = self.next()
+ self.restore(ctx)
+ return tok_tuple
+
def end_of_file(self):
"""return DEDENT and ENDMARKER"""
Modified: pypy/branch/pycompiler/module/recparser/pythonparse.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonparse.py (original)
+++ pypy/branch/pycompiler/module/recparser/pythonparse.py Thu May 26 19:18:06 2005
@@ -16,6 +16,8 @@
grammar.DEBUG = 0
gram = parse_grammar( file(PYTHON_GRAMMAR) )
grammar.DEBUG = level
+ # Build first sets for each rule (including anonymous ones)
+ grammar.build_first_sets(gram.items)
return gram
PYTHON_PARSER = python_grammar()
More information about the Pypy-commit
mailing list