[pypy-svn] r12853 - pypy/branch/pycompiler/module/recparser
adim at codespeak.net
adim at codespeak.net
Fri May 27 20:43:13 CEST 2005
Author: adim
Date: Fri May 27 20:43:13 2005
New Revision: 12853
Modified:
pypy/branch/pycompiler/module/recparser/ebnflexer.py
pypy/branch/pycompiler/module/recparser/ebnfparse.py
pypy/branch/pycompiler/module/recparser/grammar.py
Log:
- essentially cleaning ebnflexer.py
- made the LOOK_AHEAD optional
- improved peek / restore_context in ebnflexer
Modified: pypy/branch/pycompiler/module/recparser/ebnflexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/ebnflexer.py (original)
+++ pypy/branch/pycompiler/module/recparser/ebnflexer.py Fri May 27 20:43:13 2005
@@ -24,7 +24,7 @@
self._peeked = None
def context(self):
- return self.pos
+ return self.pos, self._peeked
def offset(self, ctx=None):
if ctx is None:
@@ -33,8 +33,8 @@
assert type(ctx)==int
return ctx
- def restore(self, ctx ):
- self.pos = ctx
+ def restore(self, ctx):
+ self.pos, self._peeked = ctx
def next(self):
if self._peeked is not None:
Modified: pypy/branch/pycompiler/module/recparser/ebnfparse.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/ebnfparse.py (original)
+++ pypy/branch/pycompiler/module/recparser/ebnfparse.py Fri May 27 20:43:13 2005
@@ -250,10 +250,6 @@
rules = [ star, star_opt, symbol, alternative, rule, grammar, sequence,
seq_cont_list, sequence_cont, option, group, alt ]
- for r in rules:
- r._trace = False
- for tk in r.args:
- tk._trace = False
build_first_sets( rules )
return grammar
Modified: pypy/branch/pycompiler/module/recparser/grammar.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/grammar.py (original)
+++ pypy/branch/pycompiler/module/recparser/grammar.py Fri May 27 20:43:13 2005
@@ -9,6 +9,7 @@
"""
DEBUG = 0
+USE_LOOKAHEAD = True
#### Abstract interface for a lexer/tokenizer
class TokenSource(object):
@@ -48,22 +49,29 @@
def build_first_sets(rules):
- # PSEUDO CODE
+ """builds the real first tokens set for each rule in <rules>
+
+ Because a rule can be recursive (directly or indirectly), the
+ *simplest* algorithm to build each first set is to recompute them
+ until Computation(N) = Computation(N-1), N being the number of rounds.
+ As an example, on Python2.3's grammar, we need 19 cycles to compute
+ full first sets.
+ """
changed = True
- loops = 0
while changed:
- loops += 1
+ # loop while one first set is changed
changed = False
for rule in rules:
+ # For each rule, recompute first set
size = len(rule.first_set)
rule.calc_first_set()
new_size = len(rule.first_set)
if new_size != size:
changed = True
- print "Done", loops, "loops"
for r in rules:
r.reorder_rule()
+
from syntaxtree import SyntaxNode, TempSyntaxNode, TokenNode
class BaseGrammarBuilder(object):
@@ -136,9 +144,10 @@
def is_root(self):
"""This is a root node of the grammar, that is one that will
be included in the syntax tree"""
- if self.name!=":" and self.name.startswith(":"):
+ if self.name != ":" and self.name.startswith(":"):
return False
return True
+
def match(self, source, builder, level=0):
"""Try to match a grammar rule
@@ -152,6 +161,9 @@
returns None if no match or an object build by builder
"""
+ if not USE_LOOKAHEAD:
+ return self._match(source, builder, level)
+
token = source.peek()
pos1 = source.get_pos()
in_first_set = self.match_first_set(token)
@@ -159,40 +171,36 @@
if EmptyToken in self.first_set:
ret = builder.sequence(self, source, 0 )
if self._trace:
- prefix = '%seee' % (' ' * level)
- print prefix, " RULE =", self
- print prefix, " TOKEN =", token
- print prefix, " FIRST SET =", self.first_set
+ self._debug_display(token, level, 'eee')
return self.debug_return( ret, 0 )
if self._trace:
- prefix = '%srrr' % (' ' * level)
- print prefix, " RULE =", self
- print prefix, " TOKEN =", token
- print prefix, " FIRST SET =", self.first_set
+ self._debug_display(token, level, 'rrr')
return None
elif self._trace:
- prefix = '%s>>>' % (' ' * level)
- print prefix, " RULE =", self
- print prefix, " TOKEN =", token
- print prefix, " FIRST SET =", self.first_set
-
- # <tmpdebug>
+ self._debug_display(token, level, '>>>')
+
res = self._match(source, builder, level)
if self._trace:
pos2 = source.get_pos()
if res:
- prefix = '%s+++' % (' ' * level)
+ prefix = '+++'
else:
- prefix = '%s---' % (' ' * level)
- print prefix, " RULE =", self
- print prefix, " TOKEN =", token
- print prefix, " FIRST SET =", self.first_set
- print prefix, " TEXT ='%s'" % source.get_source_text(pos1,pos2)
+ prefix = '---'
+ self._debug_display(token, level, prefix)
+ print ' '*level, prefix, " TEXT ='%s'" % (
+ source.get_source_text(pos1,pos2))
if res:
print "*" * 50
- # </tmpdebug>
return res
+ def _debug_display(self, token, level, prefix):
+ """prints context debug information"""
+ prefix = '%s%s' % (' ' * level, prefix)
+ print prefix, " RULE =", self
+ print prefix, " TOKEN =", token
+ print prefix, " FIRST SET =", self.first_set
+
+
def _match(self, source, builder, level=0):
"""Try to match a grammar rule
@@ -214,14 +222,6 @@
# To consider if we need to improve speed in parsing
pass
-## def calc_first_set(self):
-## """Returns a list of possible tokens that can start this rule
-## None means the rule can be empty
-## """
-## # **NOT USED** **NOT IMPLEMENTED**
-## # To consider if we need to improve speed in parsing
-## pass
-
def __str__(self):
return self.display(0)
@@ -287,10 +287,11 @@
# try instead to get the longest alternative
# to see if this solve our problems with infinite recursion
for rule in self.args:
- if not rule.match_first_set(tok) and EmptyToken not in rule.first_set:
- if self._trace:
- print "Skipping impossible rule: %s" % (rule,)
- continue
+ if USE_LOOKAHEAD:
+ if not rule.match_first_set(tok) and EmptyToken not in rule.first_set:
+ if self._trace:
+ print "Skipping impossible rule: %s" % (rule,)
+ continue
m = rule.match(source, builder, level+1)
if m:
ret = builder.alternative( self, source )
@@ -312,32 +313,43 @@
if S -> (A | B | C):
LAH(S) = Union( LAH(A), LAH(B), LAH(C) )
"""
-
# do this to avoid problems on indirect recursive rules
for rule in self.args:
for t in rule.first_set:
if t not in self.first_set:
self.first_set.append(t)
+ # self.first_set[t] = 1
- def reorder_rules(self):
+ def reorder_rule(self):
# take the opportunity to reorder rules in alternatives
# so that rules with Empty in their first set come last
# warn if two rules have empty in their first set
empty_set = []
not_empty_set = []
- for r in self.args:
- if EmptyToken in r.first_set:
- empty_set.append( r )
+ # <tokens> is only needed for warning / debugging purposes
+ tokens_set = []
+ for rule in self.args:
+ if EmptyToken in rule.first_set:
+ empty_set.append(rule)
else:
- not_empty_set.append( r )
- if len(empty_set)>1 and not self._reordered:
+ not_empty_set.append(rule)
+ if DEBUG:
+ # This loop is only needed for warning / debugging purposes
+ # It will check if a token is part of several first sets of
+ # a same alternative
+ for token in rule.first_set:
+ if token is not EmptyToken and token in tokens_set:
+ print "Warning, token %s in\n\t%s's first set is part " \
+ "of a previous rule's first set in alternative\n\t" \
+ "%s" % (token, rule, self)
+ tokens_set.append(token)
+ if len(empty_set) > 1 and not self._reordered:
print "Warning: alternative %s has more than one rule matching Empty" % self
self._reordered = True
self.args[:] = not_empty_set
self.args.extend( empty_set )
-
class Sequence(GrammarElement):
"""Represents a Sequence in a grammar rule (as in S -> A B C)"""
def __init__(self, name, *args):
@@ -385,10 +397,12 @@
for rule in self.args:
if EmptyToken in self.first_set:
self.first_set.remove( EmptyToken )
+ # del self.first_set[EmptyToken]
# while we're in this loop, keep aggregating possible tokens
for t in rule.first_set:
if t not in self.first_set:
self.first_set.append(t)
+ # self.first_set[t] = 1
if EmptyToken not in rule.first_set:
break
@@ -406,6 +420,7 @@
self.star = "x"
if self.min == 0:
self.first_set.append( EmptyToken )
+ # self.first_set[EmptyToken] = 1
def _match(self, source, builder, level=0):
"""matches a number of times self.args[0]. the number must be comprised
@@ -458,8 +473,10 @@
"""
rule = self.args[0]
self.first_set = rule.first_set[:]
+ # self.first_set = dict(rule.first_set)
if self.min == 0 and EmptyToken not in self.first_set:
self.first_set.append(EmptyToken)
+ # self.first_set[EmptyToken] = 1
class Token(GrammarElement):
"""Represents a Token in a grammar rule (a lexer token)"""
@@ -467,8 +484,9 @@
GrammarElement.__init__( self, name )
self.value = value
self.first_set = [self]
+ # self.first_set = {self: 1}
- def _match(self, source, builder, level=0):
+ def match(self, source, builder, level=0):
"""Matches a token.
the default implementation is to match any token whose type
corresponds to the object's name. You can extend Token
More information about the Pypy-commit
mailing list