[Python-checkins] r52995 - in sandbox/trunk/2to3: PatternGrammar.txt fix_apply.py fix_has_key.py patcomp.py pgen2/driver.py pgen2/pgen.py pytree.py

Mon Dec 11 04:57:44 CET 2006

Author: guido.van.rossum
Date: Mon Dec 11 04:57:36 2006
New Revision: 52995

Modified:
   sandbox/trunk/2to3/PatternGrammar.txt
   sandbox/trunk/2to3/fix_apply.py
   sandbox/trunk/2to3/fix_has_key.py
   sandbox/trunk/2to3/patcomp.py
   sandbox/trunk/2to3/pgen2/driver.py
   sandbox/trunk/2to3/pgen2/pgen.py
   sandbox/trunk/2to3/pytree.py
Log:
Use a fixed number instead of sys.maxint.
Change pattern repr.
Add some pattern optimizations.
Allow {N} in pattern syntax.
Refactor parser driver to allow a different source of tokens than tokenize.
Add a filter for tokenize that removes significant whitespace
(so it can be insignificant for patterns).
Close the stream from which the grammar is read if we opened it.


Modified: sandbox/trunk/2to3/PatternGrammar.txt
==============================================================================

--- sandbox/trunk/2to3/PatternGrammar.txt	(original)
+++ sandbox/trunk/2to3/PatternGrammar.txt	Mon Dec 11 04:57:36 2006
@@ -23,6 +23,6 @@
 
 NegatedUnit: 'not' (STRING | NAME [Details] | '(' Alternatives ')')
 
-Repeater: '*' | '+' | '{' NUMBER ',' NUMBER '}'
+Repeater: '*' | '+' | '{' NUMBER [',' NUMBER] '}'
 
 Details: '<' Alternatives '>'

Modified: sandbox/trunk/2to3/fix_apply.py
==============================================================================
--- sandbox/trunk/2to3/fix_apply.py	(original)
+++ sandbox/trunk/2to3/fix_apply.py	Mon Dec 11 04:57:36 2006
@@ -64,7 +64,7 @@
 
 # Tree matching patterns
 pat_compile = patcomp.PatternCompiler().compile_pattern
-p_apply = pat_compile("""(
+p_apply = pat_compile("""
 power< 'apply'
     trailer<
         '('
@@ -76,7 +76,7 @@
         ')'
     >
 >
-)"""
+"""
     )
 
 

Modified: sandbox/trunk/2to3/fix_has_key.py
==============================================================================
--- sandbox/trunk/2to3/fix_has_key.py	(original)
+++ sandbox/trunk/2to3/fix_has_key.py	Mon Dec 11 04:57:36 2006
@@ -55,7 +55,7 @@
 
 # Tree matching pattern
 pat_compile = patcomp.PatternCompiler().compile_pattern
-p_has_key_top = pat_compile("""(
+p_has_key_top = pat_compile("""
 power<
     before=any+
     trailer< '.' 'has_key' >
@@ -68,7 +68,7 @@
     >
     after=any*
 >
-)""")
+""")
 
 def fix_has_key(node):
     results = {}

Modified: sandbox/trunk/2to3/patcomp.py
==============================================================================
--- sandbox/trunk/2to3/patcomp.py	(original)
+++ sandbox/trunk/2to3/patcomp.py	Mon Dec 11 04:57:36 2006
@@ -10,9 +10,9 @@
 
 __author__ = "Guido van Rossum <guido at python.org>"
 
-# Python tokens
-import sys
+# Python imports
 import token
+import tokenize
 
 # Fairly local imports
 from pgen2 import driver
@@ -35,6 +35,16 @@
             setattr(self, name, grammar.symbol2number[name])
 
 
+def tokenize_wrapper(input):
+    """Tokenizes a string suppressing significant whitespace."""
+    skip = (token.NEWLINE, token.INDENT, token.DEDENT)
+    tokens = tokenize.generate_tokens(driver.generate_lines(input).next)
+    for quintuple in tokens:
+        type, value, start, end, line_text = quintuple
+        if type not in skip:
+            yield quintuple
+
+
 class PatternCompiler(object):
 
     def __init__(self, grammar_file="PatternGrammar.txt"):
@@ -50,7 +60,8 @@
 
     def compile_pattern(self, input, debug=False):
         """Compiles a pattern string to a nested pytree.*Pattern object."""
-        root = self.driver.parse_string(input, debug=debug)
+        tokens = tokenize_wrapper(input)
+        root = self.driver.parse_tokens(tokens, debug=debug)
         return self.compile_node(root)
 
     def compile_node(self, node):
@@ -58,22 +69,30 @@
 
         This is one big switch on the node type.
         """
-        # XXX Leave the optimizations to later
+        # XXX Optimize certain Wildcard-containing-Wildcard patterns
+        # that can be merged
         if node.type == self.syms.Matcher:
             node = node.children[0] # Avoid unneeded recursion
 
         if node.type == self.syms.Alternatives:
             # Skip the odd children since they are just '|' tokens
             alts = [self.compile_node(ch) for ch in node.children[::2]]
-            return pytree.WildcardPattern([[a] for a in alts], min=1, max=1)
+            if len(alts) == 1:
+                return alts[0]
+            p = pytree.WildcardPattern([[a] for a in alts], min=1, max=1)
+            return p.optimize()
 
         if node.type == self.syms.Alternative:
             units = [self.compile_node(ch) for ch in node.children]
-            return pytree.WildcardPattern([units], min=1, max=1)
+            if len(units) == 1:
+                return units[0]
+            p = pytree.WildcardPattern([units], min=1, max=1)
+            return p.optimize()
 
         if node.type == self.syms.NegatedUnit:
             pattern = self.compile_basic(node.children[1:])
-            return pytree.NegatedPattern(pattern)
+            p = pytree.NegatedPattern(pattern)
+            return p.optimize()
 
         assert node.type == self.syms.Unit
 
@@ -96,19 +115,25 @@
             child = children[0]
             if child.type == token.STAR:
                 min = 0
-                max = sys.maxint
+                max = pytree.HUGE
             elif child.type == token.PLUS:
                 min = 1
-                max = sys.maxint
+                max = pytree.HUGE
+            elif child.type == token.LBRACE:
+                assert children[-1].type == token.RBRACE
+                assert  len(children) in (3, 5)
+                min = max = self.get_int(children[1])
+                if len(children) == 5:
+                    max = self.get_int(children[3])
             else:
-                assert len(children) == 5
-                assert child.type == token.LBRACE
-                min = self.get_int(children[1])
-                max = self.get_int(children[3])
-            pattern = pytree.WildcardPattern([[pattern]], min=min, max=max)
+                assert False
+            if min != 1 or max != 1:
+                pattern = pattern.optimize()
+                pattern = pytree.WildcardPattern([[pattern]], min=min, max=max)
+
         if name is not None:
             pattern.name = name
-        return pattern
+        return pattern.optimize()
 
     def compile_basic(self, nodes, repeat=None):
         # Compile STRING | NAME [Details] | (...) | [...]
@@ -164,11 +189,15 @@
         return pytree.Leaf(type, value, context=context)
 
 
-def test():
+_SAMPLE = """(a=(power< ('apply' trailer<'(' b=(not STRING) ')'> ) >){1})
+{1,1}"""
+
+
+def _test():
     pc = PatternCompiler()
-    pat = pc.compile_pattern("a=power< 'apply' trailer<'(' b=(not STRING) ')'> >")
+    pat = pc.compile_pattern(_SAMPLE)
     print pat
 
 
 if __name__ == "__main__":
-    test()
+    _test()

Modified: sandbox/trunk/2to3/pgen2/driver.py
==============================================================================
--- sandbox/trunk/2to3/pgen2/driver.py	(original)
+++ sandbox/trunk/2to3/pgen2/driver.py	Mon Dec 11 04:57:36 2006
@@ -25,6 +25,7 @@
 from pgen2 import parse
 from pgen2 import grammar
 
+
 class Driver(object):
 
     def __init__(self, grammar, convert=None, logger=None):
@@ -34,15 +35,16 @@
         self.logger = logger
         self.convert = convert
 
-    def parse_stream_raw(self, stream, debug=False):
-        """Parse a stream and return the concrete syntax tree."""
+    def parse_tokens(self, tokens, debug=False):
+        """Parse a series of tokens and return the syntax tree."""
+        # XXX Move the prefix computation into a wrapper around tokenize.
         p = parse.Parser(self.grammar, self.convert)
         p.setup()
         lineno = 1
         column = 0
         type = value = start = end = line_text = None
         prefix = ""
-        for quintuple in tokenize.generate_tokens(stream.readline):
+        for quintuple in tokens:
             type, value, start, end, line_text = quintuple
             if start != (lineno, column):
                 assert (lineno, column) <= start, ((lineno, column), start)
@@ -80,6 +82,11 @@
             raise parse.ParseError("incomplete input", t, v, x)
         return p.rootnode
 
+    def parse_stream_raw(self, stream, debug=False):
+        """Parse a stream and return the syntax tree."""
+        tokens = tokenize.generate_tokens(stream.readline)
+        return self.parse_tokens(tokens, debug)
+
     def parse_stream(self, stream, debug=False):
         """Parse a stream and return the syntax tree."""
         return self.parse_stream_raw(stream, debug)
@@ -94,9 +101,17 @@
 
     def parse_string(self, text, debug=False):
         """Parse a string and return the syntax tree."""
-        from StringIO import StringIO
-        stream = StringIO(text)
-        return self.parse_stream(stream, debug)
+        tokens = tokenize.generate_tokens(generate_lines(text).next)
+        return self.parse_tokens(tokens, debug)
+
+
+def generate_lines(text):
+    """Generator that behaves like readline without using StringIO."""
+    for line in text.splitlines(True):
+        yield line
+    while True:
+        yield ""
+
 
 def load_grammar(gt="Grammar.txt", gp=None,
                  save=True, force=False, logger=None):
@@ -120,6 +135,7 @@
         g.load(gp)
     return g
 
+
 def _newer(a, b):
     """Inquire whether file a was written since file b."""
     if not os.path.exists(a):

Modified: sandbox/trunk/2to3/pgen2/pgen.py
==============================================================================
--- sandbox/trunk/2to3/pgen2/pgen.py	(original)
+++ sandbox/trunk/2to3/pgen2/pgen.py	Mon Dec 11 04:57:36 2006
@@ -14,13 +14,17 @@
 class ParserGenerator(object):
 
     def __init__(self, filename, stream=None):
+        close_stream = None
         if stream is None:
             stream = open(filename)
+            close_stream = stream.close
         self.filename = filename
         self.stream = stream
         self.generator = tokenize.generate_tokens(stream.readline)
         self.gettoken() # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
+        if close_stream is not None:
+            close_stream()
         self.first = {} # map from symbol name to set of tokens
         self.addfirstsets()
 

Modified: sandbox/trunk/2to3/pytree.py
==============================================================================
--- sandbox/trunk/2to3/pytree.py	(original)
+++ sandbox/trunk/2to3/pytree.py	Mon Dec 11 04:57:36 2006
@@ -11,7 +11,8 @@
 
 __author__ = "Guido van Rossum <guido at python.org>"
 
-import sys
+
+HUGE = 0x7FFFFFFF  # maximum repeat count, default max
 
 
 class Base(object):
@@ -247,10 +248,17 @@
         return object.__new__(cls, *args, **kwds)
 
     def __repr__(self):
-        return "%s(%s)" % (self.__class__.__name__,
-                           ", ".join("%s=%r" % (x, getattr(self, x))
-                                     for x in ("type", "content", "name")
-                                     if getattr(self, x) is not None))
+        args = [self.type, self.content, self.name]
+        while args and args[-1] is None:
+            del args[-1]
+        return "%s(%s)" % (self.__class__.__name__, ", ".join(map(repr, args)))
+
+    def optimize(self):
+        """A subclass can define this as a hook for optimizations.
+
+        Returns either self or another node with the same effect.
+        """
+        return self
 
     def match(self, node, results=None):
         """Does this pattern exactly match a node?
@@ -410,7 +418,7 @@
     except it always uses non-greedy matching.
     """
 
-    def __init__(self, content=None, min=0, max=sys.maxint, name=None):
+    def __init__(self, content=None, min=0, max=HUGE, name=None):
         """Initializer.
 
         Args:
@@ -418,7 +426,7 @@
                      if absent, matches one node;
                      if present, each subsequence is an alternative [*]
             min: optinal minumum number of times to match, default 0
-            max: optional maximum number of times tro match, default sys.maxint
+            max: optional maximum number of times tro match, default HUGE
             name: optional name assigned to this match
 
         [*] Thus, if content is [[a, b, c], [d, e], [f, g, h]] this is
@@ -432,7 +440,7 @@
             If content is not None, replace the dot with the parenthesized
             list of alternatives, e.g. (a b c | d e | f g h)*
         """
-        assert 0 <= min <= max <= sys.maxint, (min, max)
+        assert 0 <= min <= max <= HUGE, (min, max)
         if content is not None:
             content = tuple(map(tuple, content))  # Protect against alterations
             # Check sanity of alternatives
@@ -444,6 +452,25 @@
         self.max = max
         self.name = name
 
+    def optimize(self):
+        """Optimize certain stacked wildcard patterns."""
+        subpattern = None
+        if (self.content is not None and
+            len(self.content) == 1 and len(self.content[0]) == 1):
+            subpattern = self.content[0][0]
+        if self.min == 1 and self.max == 1:
+            if self.content is None:
+                return NodePattern(name=self.name)
+            if subpattern is not None and  self.name == subpattern.name:
+                return subpattern.optimize()
+        if (self.min <= 1 and isinstance(subpattern, WildcardPattern) and
+            subpattern.min <= 1 and self.name == subpattern.name):
+            return WildcardPattern(subpattern.content,
+                                   self.min*subpattern.min,
+                                   self.max*subpattern.max,
+                                   subpattern.name)
+        return self
+
     def match(self, node, results=None):
         """Does this pattern exactly match a node?"""
         return self.match_seq([node], results)