[pypy-svn] r78901 - pypy/branch/fast-forward/lib-python/2.7.0/lib2to3

afa at codespeak.net afa at codespeak.net
Mon Nov 8 23:46:58 CET 2010


Author: afa
Date: Mon Nov  8 23:46:55 2010
New Revision: 78901

Added:
   pypy/branch/fast-forward/lib-python/2.7.0/lib2to3/btm_matcher.py
   pypy/branch/fast-forward/lib-python/2.7.0/lib2to3/btm_utils.py
Log:
Add missing files in lib2to3


Added: pypy/branch/fast-forward/lib-python/2.7.0/lib2to3/btm_matcher.py
==============================================================================
--- (empty file)
+++ pypy/branch/fast-forward/lib-python/2.7.0/lib2to3/btm_matcher.py	Mon Nov  8 23:46:55 2010
@@ -0,0 +1,168 @@
+"""A bottom-up tree matching algorithm implementation meant to speed
+up 2to3's matching process. After the tree patterns are reduced to
+their rarest linear path, a linear Aho-Corasick automaton is
+created. The linear automaton traverses the linear paths from the
+leaves to the root of the AST and returns a set of nodes for further
+matching. This significantly reduces the number of candidate nodes."""
+
+__author__ = "George Boutsioukis <gboutsioukis at gmail.com>"
+
+import logging
+import itertools
+from collections import defaultdict
+
+from . import pytree
+from .btm_utils import reduce_tree
+
+class BMNode(object):
+    """Class for a node (state) of the Aho-Corasick automaton used in matching"""
+    count = itertools.count()  # shared by all instances; source of the ids below
+    def __init__(self):
+        self.transition_table = {}  # token -> next BMNode, filled by BottomMatcher.add
+        self.fixers = []  # fixers whose linear pattern ends at this state
+        self.id = next(BMNode.count)  # sequential id; print_ac uses it as graph node id
+        self.content = ''  # only printed by print_ac; not assigned elsewhere in this module
+
+class BottomMatcher(object):
+    """The main matcher class. After instantiating the patterns should
+    be added using the add_fixer method"""
+
+    def __init__(self):
+        self.match = set()
+        self.root = BMNode()
+        self.nodes = [self.root]
+        self.fixers = []
+        self.logger = logging.getLogger("RefactoringTool")
+
+    def add_fixer(self, fixer):
+        """Reduces a fixer's pattern tree to a linear path and adds it
+        to the matcher(a common Aho-Corasick automaton). The fixer is
+        appended on the matching states and called when they are
+        reached"""
+        self.fixers.append(fixer)
+        tree = reduce_tree(fixer.pattern_tree)
+        linear = tree.get_linear_subpattern()
+        match_nodes = self.add(linear, start=self.root)
+        for match_node in match_nodes:
+            match_node.fixers.append(fixer)
+
+    def add(self, pattern, start):
+        "Recursively adds a linear pattern to the AC automaton"
+        #print("adding pattern", pattern, "to", start)
+        if not pattern:
+            #print("empty pattern")
+            return [start]
+        if isinstance(pattern[0], tuple):
+            #alternatives
+            #print("alternatives")
+            match_nodes = []
+            for alternative in pattern[0]:
+                #add all alternatives, and add the rest of the pattern
+                #to each end node
+                end_nodes = self.add(alternative, start=start)
+                for end in end_nodes:
+                    match_nodes.extend(self.add(pattern[1:], end))
+            return match_nodes
+        else:
+            #single token
+            #not last
+            if pattern[0] not in start.transition_table:
+                #transition did not exist, create new
+                next_node = BMNode()
+                start.transition_table[pattern[0]] = next_node
+            else:
+                #transition exists already, follow
+                next_node = start.transition_table[pattern[0]]
+
+            if pattern[1:]:
+                end_nodes = self.add(pattern[1:], start=next_node)
+            else:
+                end_nodes = [next_node]
+            return end_nodes
+
+    def run(self, leaves):
+        """The main interface with the bottom matcher. The tree is
+        traversed from the bottom using the constructed
+        automaton. Nodes are only checked once as the tree is
+        retraversed. When the automaton fails, we give it one more
+        shot(in case the above tree matches as a whole with the
+        rejected leaf), then we break for the next leaf. There is the
+        special case of multiple arguments(see code comments) where we
+        recheck the nodes
+
+        Args:
+           The leaves of the AST tree to be matched
+
+        Returns:
+           A dictionary of node matches with fixers as the keys
+        """
+        current_ac_node = self.root
+        results = defaultdict(list)
+        for leaf in leaves:
+            current_ast_node = leaf
+            while current_ast_node:
+                current_ast_node.was_checked = True
+                for child in current_ast_node.children:
+                    # multiple statements, recheck
+                    if isinstance(child, pytree.Leaf) and child.value == u";":
+                        current_ast_node.was_checked = False
+                        break
+                if current_ast_node.type == 1:
+                    #name
+                    node_token = current_ast_node.value
+                else:
+                    node_token = current_ast_node.type
+
+                if node_token in current_ac_node.transition_table:
+                    #token matches
+                    current_ac_node = current_ac_node.transition_table[node_token]
+                    for fixer in current_ac_node.fixers:
+                        if not fixer in results:
+                            results[fixer] = []
+                        results[fixer].append(current_ast_node)
+
+                else:
+                    #matching failed, reset automaton
+                    current_ac_node = self.root
+                    if (current_ast_node.parent is not None
+                        and current_ast_node.parent.was_checked):
+                        #the rest of the tree upwards has been checked, next leaf
+                        break
+
+                    #recheck the rejected node once from the root
+                    if node_token in current_ac_node.transition_table:
+                        #token matches
+                        current_ac_node = current_ac_node.transition_table[node_token]
+                        for fixer in current_ac_node.fixers:
+                            if not fixer in results.keys():
+                                results[fixer] = []
+                            results[fixer].append(current_ast_node)
+
+                current_ast_node = current_ast_node.parent
+        return results
+
+    def print_ac(self):
+        "Prints a graphviz diagram of the BM automaton(for debugging)"
+        print("digraph g{")
+        def print_node(node):
+            for subnode_key in node.transition_table.keys():
+                subnode = node.transition_table[subnode_key]
+                print("%d -> %d [label=%s] //%s" %
+                      (node.id, subnode.id, type_repr(subnode_key), str(subnode.fixers)))
+                if subnode_key == 1:
+                    print(subnode.content)
+                print_node(subnode)
+        print_node(self.root)
+        print("}")
+
+# taken from pytree.py for debugging; only used by print_ac
+_type_reprs = {}
+def type_repr(type_num):
+    global _type_reprs
+    if not _type_reprs:
+        from .pygram import python_symbols
+        # printing tokens is possible but not as useful
+        # from .pgen2 import token // token.__dict__.items():
+        for name, val in python_symbols.__dict__.items():
+            if type(val) == int: _type_reprs[val] = name
+    return _type_reprs.setdefault(type_num, type_num)

Added: pypy/branch/fast-forward/lib-python/2.7.0/lib2to3/btm_utils.py
==============================================================================
--- (empty file)
+++ pypy/branch/fast-forward/lib-python/2.7.0/lib2to3/btm_utils.py	Mon Nov  8 23:46:55 2010
@@ -0,0 +1,283 @@
+"Utility functions used by the btm_matcher module"
+
+from . import pytree
+from .pgen2 import grammar, token
+from .pygram import pattern_symbols, python_symbols
+
+syms = pattern_symbols  # node types of the compiled pattern tree (Matcher, Unit, ...)
+pysyms = python_symbols  # looked up by name in reduce_tree for non-token names
+tokens = grammar.opmap  # reduce_tree uses it as: literal string -> node type
+token_labels = token  # token module; provides NAME, STRING and named token types
+
+TYPE_ANY = -1  # MinNode type used for the 'any' wildcard
+TYPE_ALTERNATIVES = -2  # MinNode type used for a real (a | b) choice
+TYPE_GROUP = -3  # MinNode type used for an Alternative with several children
+
+class MinNode(object):
+    """This class serves as an intermediate representation of the
+    pattern tree during the conversion to sets of leaf-to-root
+    subpatterns"""
+
+    def __init__(self, type=None, name=None):
+        self.type = type  # token/symbol number or one of the TYPE_* markers
+        self.name = name  # literal text; reduce_tree sets it for NAME nodes only
+        self.children = []
+        self.leaf = False  # NOTE(review): never read in this module
+        self.parent = None  # assigned by reduce_tree; stays None for the root
+        self.alternatives = []  # scratch list filled and reset by leaf_to_root
+        self.group = []  # scratch list filled and reset by leaf_to_root
+
+    def __repr__(self):
+        return str(self.type) + ' ' + str(self.name)
+
+    def leaf_to_root(self):
+        """Internal method. Walks from this leaf to the root and returns
+        the merged linear path, or None while an alternatives/group
+        ancestor still waits for its other children (run on all leaves)."""
+        node = self
+        subp = []
+        while node:
+            if node.type == TYPE_ALTERNATIVES:
+                node.alternatives.append(subp)
+                if len(node.alternatives) == len(node.children):
+                    #last alternative: merge them all into one tuple element
+                    subp = [tuple(node.alternatives)]
+                    node.alternatives = []
+                    node = node.parent
+                    continue
+                else:
+                    node = node.parent
+                    subp = None
+                    break
+
+            if node.type == TYPE_GROUP:
+                node.group.append(subp)
+                #probably should check the number of leaves
+                if len(node.group) == len(node.children):
+                    subp = get_characteristic_subpattern(node.group)
+                    node.group = []
+                    node = node.parent
+                    continue
+                else:
+                    node = node.parent
+                    subp = None
+                    break
+
+            if node.type == token_labels.NAME and node.name:
+                #in case of type=name, use the name instead
+                subp.append(node.name)
+            else:
+                subp.append(node.type)
+
+            node = node.parent
+        return subp
+
+    def get_linear_subpattern(self):
+        """Drives the leaf_to_root method. leaf_to_root must be run
+        once per leaf because we need to reduce 'group' matches; for
+        example the alternative form (a | b c) creates a group [b c]
+        that needs to be matched. Since matching multiple linear
+        patterns exceeds the automaton's capabilities, leaf_to_root
+        merges each group into a single choice based on how
+        'characteristic' its members are,
+
+        i.e. (a|b c) -> (a|b) if b more characteristic than c
+
+        Returns: The most 'characteristic' (as defined by
+          get_characteristic_subpattern) path for the compiled pattern
+          tree, i.e. the first non-empty result of leaf_to_root.
+        """
+
+        for l in self.leaves():
+            subp = l.leaf_to_root()
+            if subp:
+                return subp
+
+    def leaves(self):
+        "Generator that returns the leaves of the tree"
+        for child in self.children:
+            for x in child.leaves():
+                yield x
+        if not self.children:
+            yield self  # a node without children is itself a leaf
+
+def reduce_tree(node, parent=None):
+    """
+    Internal function. Reduces a compiled pattern tree to an
+    intermediate representation suitable for feeding the
+    automaton. This also trims off any optional pattern elements(like
+    [a], a*).
+    """
+
+    new_node = None
+    #switch on the node type
+    if node.type == syms.Matcher:
+        #skip
+        node = node.children[0]
+
+    if node.type == syms.Alternatives  :
+        #2 cases
+        if len(node.children) <= 2:
+            #just a single 'Alternative', skip this node
+            new_node = reduce_tree(node.children[0], parent)
+        else:
+            #real alternatives
+            new_node = MinNode(type=TYPE_ALTERNATIVES)
+            #skip odd children('|' tokens)
+            for child in node.children:
+                if node.children.index(child)%2:
+                    continue
+                reduced = reduce_tree(child, new_node)
+                if reduced is not None:
+                    new_node.children.append(reduced)
+    elif node.type == syms.Alternative:
+        if len(node.children) > 1:
+
+            new_node = MinNode(type=TYPE_GROUP)
+            for child in node.children:
+                reduced = reduce_tree(child, new_node)
+                if reduced:
+                    new_node.children.append(reduced)
+            if not new_node.children:
+                # delete the group if all of the children were reduced to None
+                new_node = None
+
+        else:
+            new_node = reduce_tree(node.children[0], parent)
+
+    elif node.type == syms.Unit:
+        if (isinstance(node.children[0], pytree.Leaf) and
+            node.children[0].value == '('):
+            #skip parentheses
+            return reduce_tree(node.children[1], parent)
+        if ((isinstance(node.children[0], pytree.Leaf) and
+               node.children[0].value == '[')
+               or
+               (len(node.children)>1 and
+               hasattr(node.children[1], "value") and
+               node.children[1].value == '[')):
+            #skip whole unit if its optional
+            return None
+
+        leaf = True
+        details_node = None
+        alternatives_node = None
+        has_repeater = False
+        repeater_node = None
+        has_variable_name = False
+
+        for child in node.children:
+            if child.type == syms.Details:
+                leaf = False
+                details_node = child
+            elif child.type == syms.Repeater:
+                has_repeater = True
+                repeater_node = child
+            elif child.type == syms.Alternatives:
+                alternatives_node = child
+            if hasattr(child, 'value') and child.value == '=': # variable name
+                has_variable_name = True
+
+        #skip variable name
+        if has_variable_name:
+            #skip variable name, '='
+            name_leaf = node.children[2]
+            if hasattr(name_leaf, 'value') and name_leaf.value == '(':
+                # skip parenthesis
+                name_leaf = node.children[3]
+        else:
+            name_leaf = node.children[0]
+
+        #set node type
+        if name_leaf.type == token_labels.NAME:
+            #(python) non-name or wildcard
+            if name_leaf.value == 'any':
+                new_node = MinNode(type=TYPE_ANY)
+            else:
+                if hasattr(token_labels, name_leaf.value):
+                    new_node = MinNode(type=getattr(token_labels, name_leaf.value))
+                else:
+                    new_node = MinNode(type=getattr(pysyms, name_leaf.value))
+
+        elif name_leaf.type == token_labels.STRING:
+            #(python) name or character; remove the apostrophes from
+            #the string value
+            name = name_leaf.value.strip("'")
+            if name in tokens:
+                new_node = MinNode(type=tokens[name])
+            else:
+                new_node = MinNode(type=token_labels.NAME, name=name)
+        elif name_leaf.type == syms.Alternatives:
+            new_node = reduce_tree(alternatives_node, parent)
+
+        #handle repeaters
+        if has_repeater:
+            if repeater_node.children[0].value == '*':
+                #reduce to None
+                new_node = None
+            elif repeater_node.children[0].value == '+':
+                #reduce to a single occurence i.e. do nothing
+                pass
+            else:
+                #TODO: handle {min, max} repeaters
+                raise NotImplementedError
+                pass
+
+        #add children
+        if details_node and new_node is not None:
+            for child in details_node.children[1:-1]:
+                #skip '<', '>' markers
+                reduced = reduce_tree(child, new_node)
+                if reduced is not None:
+                    new_node.children.append(reduced)
+    if new_node:
+        new_node.parent = parent
+    return new_node
+
+
+def get_characteristic_subpattern(subpatterns):
+    """Picks the most characteristic from a list of linear patterns
+    Current order used is:
+    names > common_names > common_chars
+    """
+    if not isinstance(subpatterns, list):
+        return subpatterns
+    if len(subpatterns)==1:
+        return subpatterns[0]
+
+    # first pick out the ones containing variable names
+    subpatterns_with_names = []
+    subpatterns_with_common_names = []
+    common_names = ['in', 'for', 'if' , 'not', 'None']
+    subpatterns_with_common_chars = []
+    common_chars = "[]().,:"
+    for subpattern in subpatterns:
+        if any(rec_test(subpattern, lambda x: type(x) is str)):
+            if any(rec_test(subpattern,
+                            lambda x: isinstance(x, str) and x in common_chars)):
+                subpatterns_with_common_chars.append(subpattern)
+            elif any(rec_test(subpattern,
+                              lambda x: isinstance(x, str) and x in common_names)):
+                subpatterns_with_common_names.append(subpattern)
+
+            else:
+                subpatterns_with_names.append(subpattern)
+
+    if subpatterns_with_names:
+        subpatterns = subpatterns_with_names
+    elif subpatterns_with_common_names:
+        subpatterns = subpatterns_with_common_names
+    elif subpatterns_with_common_chars:
+        subpatterns = subpatterns_with_common_chars
+    # of the remaining subpatterns pick out the longest one
+    return max(subpatterns, key=len)
+
+def rec_test(sequence, test_func):
+    """Tests test_func on all items of sequence and items of included
+    sub-iterables"""
+    for x in sequence:
+        if isinstance(x, (list, tuple)):
+            for y in rec_test(x, test_func):
+                yield y
+        else:
+            yield test_func(x)



More information about the Pypy-commit mailing list