r52858 - in sandbox/trunk/2to3: Grammar.pickle Grammar.txt pgen2 pgen2/__init__.py pgen2/__init__.pyc pgen2/astnode.py pgen2/conv.py pgen2/driver.py pgen2/grammar.py pgen2/literals.py pgen2/parse.py pgen2/pgen.py pgen2/python.py pgen2/test.py play.py pynode.py

Author: guido.van.rossum Date: Wed Nov 29 18:38:40 2006 New Revision: 52858 Added: sandbox/trunk/2to3/ sandbox/trunk/2to3/Grammar.pickle (contents, props changed) sandbox/trunk/2to3/Grammar.txt (contents, props changed) sandbox/trunk/2to3/pgen2/ sandbox/trunk/2to3/pgen2/__init__.py (contents, props changed) sandbox/trunk/2to3/pgen2/__init__.pyc (contents, props changed) sandbox/trunk/2to3/pgen2/astnode.py (contents, props changed) sandbox/trunk/2to3/pgen2/conv.py (contents, props changed) sandbox/trunk/2to3/pgen2/driver.py (contents, props changed) sandbox/trunk/2to3/pgen2/grammar.py (contents, props changed) sandbox/trunk/2to3/pgen2/literals.py (contents, props changed) sandbox/trunk/2to3/pgen2/parse.py (contents, props changed) sandbox/trunk/2to3/pgen2/pgen.py (contents, props changed) sandbox/trunk/2to3/pgen2/python.py (contents, props changed) sandbox/trunk/2to3/pgen2/test.py (contents, props changed) sandbox/trunk/2to3/play.py (contents, props changed) sandbox/trunk/2to3/pynode.py (contents, props changed) Log: Checkpoint of alternative Python 2.x-to-3.0 conversion tool. This contains a modified copy of pgen2 which was open-sourced by Elemental Security through a contributor's agreement with the PSF. Added: sandbox/trunk/2to3/Grammar.pickle ============================================================================== Binary file. No diff available. Added: sandbox/trunk/2to3/Grammar.txt ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/Grammar.txt Wed Nov 29 18:38:40 2006 @@ -0,0 +1,148 @@ +# Grammar for Python + +# Note: Changing the grammar specified in this file will most likely +# require corresponding changes in the parser module +# (../Modules/parsermodule.c). If you can't make the changes to +# that module yourself, please co-ordinate the required changes +# with someone who can; ask around on python-dev for help. Fred +# Drake <fdrake@acm.org> will probably be listening there. + +# NOTE WELL: You should also follow all the steps listed in PEP 306, +# "How to Change Python's Grammar" + +# Commands for Kees Blom's railroad program +#diagram:token NAME +#diagram:token NUMBER +#diagram:token STRING +#diagram:token NEWLINE +#diagram:token ENDMARKER +#diagram:token INDENT +#diagram:output\input python.bla +#diagram:token DEDENT +#diagram:output\textwidth 20.04cm\oddsidemargin 0.0cm\evensidemargin 0.0cm +#diagram:rules + +# Start symbols for the grammar: +# file_input is a module or sequence of commands read from an input file; +# single_input is a single interactive statement; +# eval_input is the input for the eval() and input() functions. +# NB: compound_stmt in single_input is followed by extra NEWLINE! 
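# Notation reading aid (informal): lowercase names are grammar rules,
# UPPERCASE names are token types from the token module, quoted strings
# are literal keywords or operators, [x] means optional, x* zero or more,
# x+ one or more, | separates alternatives, and parentheses group.
# For example, the rule
#     funcdef: [decorators] 'def' NAME parameters ':' suite
# below matches an optional decorator list, the keyword 'def', a name,
# a parenthesized parameter list, a colon, and an indented suite.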
+file_input: (NEWLINE | stmt)* ENDMARKER +single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE +eval_input: testlist NEWLINE* ENDMARKER + +decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE +decorators: decorator+ +funcdef: [decorators] 'def' NAME parameters ':' suite +parameters: '(' [varargslist] ')' +varargslist: ((fpdef ['=' test] ',')* + ('*' NAME [',' '**' NAME] | '**' NAME) | + fpdef ['=' test] (',' fpdef ['=' test])* [',']) +fpdef: NAME | '(' fplist ')' +fplist: fpdef (',' fpdef)* [','] + +stmt: simple_stmt | compound_stmt +simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE +small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | + import_stmt | global_stmt | exec_stmt | assert_stmt) +expr_stmt: testlist (augassign (yield_expr|testlist) | + ('=' (yield_expr|testlist))*) +augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | + '<<=' | '>>=' | '**=' | '//=') +# For normal assignments, additional restrictions enforced by the interpreter +print_stmt: 'print' ( [ test (',' test)* [','] ] | + '>>' test [ (',' test)+ [','] ] ) +del_stmt: 'del' exprlist +pass_stmt: 'pass' +flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: 'break' +continue_stmt: 'continue' +return_stmt: 'return' [testlist] +yield_stmt: yield_expr +raise_stmt: 'raise' [test [',' test [',' test]]] +import_stmt: import_name | import_from +import_name: 'import' dotted_as_names +import_from: ('from' ('.'* dotted_name | '.'+) + 'import' ('*' | '(' import_as_names ')' | import_as_names)) +import_as_name: NAME ['as' NAME] +dotted_as_name: dotted_name ['as' NAME] +import_as_names: import_as_name (',' import_as_name)* [','] +dotted_as_names: dotted_as_name (',' dotted_as_name)* +dotted_name: NAME ('.' NAME)* +global_stmt: 'global' NAME (',' NAME)* +exec_stmt: 'exec' expr ['in' test [',' test]] +assert_stmt: 'assert' test [',' test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef +if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite] +while_stmt: 'while' test ':' suite ['else' ':' suite] +for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite] +try_stmt: ('try' ':' suite + ((except_clause ':' suite)+ + ['else' ':' suite] + ['finally' ':' suite] | + 'finally' ':' suite)) +with_stmt: 'with' test [ with_var ] ':' suite +with_var: 'as' expr +# NB compile.c makes sure that the default except clause is last +except_clause: 'except' [test [',' test]] +suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT + +# Backward compatibility cruft to support: +# [ x for x in lambda: True, lambda: False if x() ] +# even while also allowing: +# lambda x: 5 if x else 2 +# (But not a mix of the two) +testlist_safe: old_test [(',' old_test)+ [',']] +old_test: or_test | old_lambdef +old_lambdef: 'lambda' [varargslist] ':' old_test + +test: or_test ['if' or_test 'else' test] | lambdef +or_test: and_test ('or' and_test)* +and_test: not_test ('and' not_test)* +not_test: 'not' not_test | comparison +comparison: expr (comp_op expr)* +comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' +expr: xor_expr ('|' xor_expr)* +xor_expr: and_expr ('^' and_expr)* +and_expr: shift_expr ('&' shift_expr)* +shift_expr: arith_expr (('<<'|'>>') arith_expr)* +arith_expr: term (('+'|'-') term)* +term: factor (('*'|'/'|'%'|'//') factor)* +factor: ('+'|'-'|'~') factor | power +power: atom trailer* ['**' factor] +atom: ('(' [yield_expr|testlist_gexp] ')' | + '[' [listmaker] ']' | + '{' [dictmaker] '}' | 
+ '`' testlist1 '`' | + NAME | NUMBER | STRING+) +listmaker: test ( list_for | (',' test)* [','] ) +testlist_gexp: test ( gen_for | (',' test)* [','] ) +lambdef: 'lambda' [varargslist] ':' test +trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME +subscriptlist: subscript (',' subscript)* [','] +subscript: '.' '.' '.' | test | [test] ':' [test] [sliceop] +sliceop: ':' [test] +exprlist: expr (',' expr)* [','] +testlist: test (',' test)* [','] +dictmaker: test ':' test (',' test ':' test)* [','] + +classdef: 'class' NAME ['(' [testlist] ')'] ':' suite + +arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) +argument: test [gen_for] | test '=' test # Really [keyword '='] test + +list_iter: list_for | list_if +list_for: 'for' exprlist 'in' testlist_safe [list_iter] +list_if: 'if' old_test [list_iter] + +gen_iter: gen_for | gen_if +gen_for: 'for' exprlist 'in' or_test [gen_iter] +gen_if: 'if' old_test [gen_iter] + +testlist1: test (',' test)* + +# not used in grammar, but may appear in "node" passed from Parser to Compiler +encoding_decl: NAME + +yield_expr: 'yield' [testlist] Added: sandbox/trunk/2to3/pgen2/__init__.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/__init__.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,4 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""The pgen2 package.""" Added: sandbox/trunk/2to3/pgen2/__init__.pyc ============================================================================== Binary file. No diff available. Added: sandbox/trunk/2to3/pgen2/astnode.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/astnode.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,434 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Syntax tree node definitions. + +There is a class or function corresponding to each terminal and +nonterminal symbol. + +The grammar is pieced together from the docstrings of the +corresponding classes and functions: text between dollar signs is +assumed to be the right-hand side of the grammar rule corresponding to +that class or function. + +We use __slots__ to make the parse tree nodes as small as possible. + +NOTE: EVERY CLASS MUST HAVE A __slots__ DEFINITION, EVEN IF EMPTY! +(If not, the instances will get a __dict__, defeating the purpose of +__slots__ in our case.) + +""" + +# Python imports +import token +import logging + +from pgen2 import grammar + +logger = logging.getLogger("pgen2.astnode") + +class Node(object): + """Abstract base class for all nodes. + + This has no attributes except a context slot which holds the lone + number (or more detailed context info). In the future this might + change this to several slots (e.g. filename, lineno, column, or + even filename, start_lineno, start_column, end_lineno, + end_column). The context is only referenced by two places: the + part of the code that sticks it it, and the part of the code that + reports errors. + + In order to reduce the amount of boilerplate code, the context is + argument is handled by __new__ rather than __init__. There are + also a few subclasses that override __new__ to sometimes avoid + constructing an instance. 
+ + """ + + __slots__ = ["context"] + + def __new__(cls, context, *rest): + assert cls not in (Node, Nonterminal, Terminal, Constant) + obj = object.__new__(cls) + obj.context = context + return obj + + _stretch = False # Set to true to stretch the repr() vertically + + def __repr__(self, repr=repr): + stretch = self._stretch + r = [self.__class__.__name__] + if stretch: + r.append("(\n ") + else: + r.append("(") # ")" -- crutch for Emacs python-mode :-( + cls = self.__class__ + # Get nearest non-empty slots __slots__. This assumes + # *something* has non-empty __slots__ before we reach object + # (which has no __slots__). The class hierarchy guarantees + # this. + slots = cls.__slots__ + while not slots: + cls = cls.__base__ + slots = cls.__slots__ + first = True + for name in slots: + if name == "context": + continue # Skip this + if first: + first = False + else: + if stretch: + r.append(",\n ") + else: + r.append(", ") + try: + value = getattr(self, name) + except AttributeError: + continue + if stretch and isinstance(value, list): + rr = map(repr, value) + rv = "[" + ",\n ".join(rr) + "]" + else: + rv = repr(value) + if stretch: + rv = rv.replace("\n", "\n ") + r.append(rv) + r.append(")") + return "".join(r) + + def __str__(self): + return self.__repr__(repr=str) + +class Nonterminal(Node): + """Abstract base class for nonterminal symbols. + + Nothing beyond Node. + + """ + + __slots__ = [] + + _stretch = True + +class Terminal(Node): + """Abstract base class for terminal symbols. + + Nothing beyond Node. + + """ + + __slots__ = [] + +class Series(Nonterminal): + """Abstract base class for nonterminals like stmts: stmt+.""" + + __slots__ = [] + + def __new__(cls, context, *nodes): + assert cls is not Series + if len(nodes) == 0: + return None + elif len(nodes) == 1: + return nodes[0] + else: + obj = Nonterminal.__new__(cls, context) + obj.initseries(nodes) + return obj + +class Constant(Terminal): + """Abstract base class for constants (e.g. number or string). + + Attributes: + + repr -- a string giving the token, exactly as read from source + + """ + + __slots__ = ["repr"] + + def __init__(self, context, repr): + self.repr = repr + + def __str__(self): + return self.repr + +# Node classes for terminal symbols + +class Name(Terminal): + """Name (e.g. a variable name or an attribute name). + + Attributes: + + name -- a string giving the name. + + """ + + __slots__ = ["name"] + + def __init__(self, context, name): + self.name = name + + def __str__(self): + return self.name + +class Number(Constant): + """Numeric constant. + + Nothing beyond Constant. + + """ + + __slots__ = [] + +class String(Constant): + """String constant. + + Nothing beyond Constant. + + """ + + __slots__ = [] + +# Example nodes and factory functions for nonterminal symbols + +def Program(context, stmts): + """Program is a nonterminal with only one non-trivial child. + + Grammar: $ Expression ENDMARKER $ + + """ + return stmts + +class Expression(Series): + "Grammar: $ BinaryExpression ['?' BinaryExpression ':' BinaryExpression] $" + + __slots__ = ["test", "left", "right"] + + def initseries(self, nodes): + self.test, self.left, self.right = nodes + + def __str__(self): + return "%s ? 
%s : %s" % (self.test, self.left, self.right) + +class BinaryExpression(Series): + "Grammar: $ Operand (Operator Operand)* $" + + __slots__ = ["left", "op", "right"] + + def initseries(self, stuff): + # Stuff is a list with alternating operands and operators + if len(stuff) == 3: + self.left, self.op, self.right = stuff + return + assert len(stuff) > 1 and len(stuff) % 2 == 1 + # Find the rightmost lowest-priority operator + lowest_i = 1 + lowest_op = stuff[lowest_i] + lowest_pri = lowest_op.priority() + for i in range(3, len(stuff), 2): + op = stuff[i] + pri = op.priority() + if pri <= lowest_pri: + lowest_i = i + lowest_op = op + lowest_pri = pri + self.left = self.__class__(self.context, *stuff[:lowest_i]) + self.op = lowest_op + self.right = self.__class__(self.context, *stuff[lowest_i+1:]) + + def __str__(self): + return "(%s %s %s)" % (self.left, self.op, self.right) + +def Operand(context, arg): + """Operand is a nonterminal with one child. + + Grammar: $ Atom | UnaryExpression $ + + """ + return arg + +class UnaryExpression(Nonterminal): + "Grammar: $ UnaryOperator Operand $" + + __slots__ = ["op", "arg"] + + def __init__(self, context, op, arg): + self.op = op + self.arg = arg + + def __str__(self): + return "%s%s" % (self.op, self.arg) + +def UnaryOperator(context, op): + "Grammar: $ 'not' | '-' $" + return op + +class Operator(Nonterminal): + """Operator. + + This has a repr slot and a priority() method. + + The __new__ method implements a sort-of singleton pattern: there's + only one instance per repr value. (Yes, this means the context is + not useful. Therefore we set it to None.) + + Grammar: $ '**' | '*' | '/' | '+' | '-' | '&' | '^' | '|' | + ['not'] 'in' | '==' | '<' | '>' | '!=' | '<=' | '>=' | + 'and' | 'or' $ + + """ + + __slots__ = ["repr"] + + _stretch = False + + _cache = {} + + def __new__(cls, context, *args): + repr = " ".join(args) + # For "not in", the argument should be the string "not in" + obj = cls._cache.get(repr) + if obj is None: + obj = Terminal.__new__(cls, None) + obj.repr = repr + return obj + + def __str__(self): + return self.repr + + def priority(self): + return self._priorities[self.repr] + + _priorities = { + "or": 0, + "and": 1, + "in": 2, + "not in": 2, + "<": 2, + ">": 2, + "==": 2, + "<=": 2, + ">=": 2, + "!=": 2, + "|": 3, + "^": 4, + "&": 5, + "+": 6, + "-": 6, + "*": 7, + "/": 7, + "**": 8, + } + +def Atom(context, arg): + """ Grammar: $ NAME | STRING | NUMBER | ParenthesizedExpression $ + """ + return arg + +def ParenthesizedExpression(context, expr): + "Grammar: $ '(' Expression ')' $" + return expr + + +# Conversion from concrete to abstract syntax trees + +def vanish(context, value): + return None + +token_mapping = { + # Tokens that become parse tree nodes + # (token.NAME is special-cased in the code) + token.NUMBER: Number, + token.STRING: String, + + # Tokens that vanish + token.DOT: vanish, + token.LPAR: vanish, + token.RPAR: vanish, + token.COLON: vanish, + token.COMMA: vanish, + token.EQUAL: vanish, + token.DEDENT: vanish, + token.INDENT: vanish, + token.LBRACE: vanish, + token.RBRACE: vanish, + token.NEWLINE: vanish, + token.ENDMARKER: vanish, + + # All other tokens return the token's string value (e.g. "+") + } + +vanishing_keywords = { + # E.g. "def": True, + } + +def convert(grammar, node): + type, value, context, children = node + # Is it a non-terminal symbol? 
+ if type in grammar.number2symbol: + symbol = grammar.number2symbol[type] + factory = globals().get(symbol) + if factory is None: + raise RuntimeError("can't find factory for %s (line %s)" % + (symbol, context)) + # Debug variation: + try: + return factory(context, *children) + except: + logger.debug("%s %s", factory.__name__, "(") + for child in children: + logger.debug("%s %s", "==>", child) + logger.debug(")") + logger.debug("# Did you remember to declare a 'context' arg?") + raise + return factory(context, *children) + + # Must be a terminal symbol. + if type == token.NAME: + # Name or keyword. Special-case the snot out of this. + if value in grammar.keywords: + # Keywords become strings, like operators + if value in vanishing_keywords: + return None + else: + return value + else: + return Name(context, value) + + # Look for a handler in the token_mapping table. + assert type in token.tok_name + factory = token_mapping.get(type) + if factory: + return factory(context, value) + else: + return value + + +# Support code + +def generate_grammar(stream): + """Extract the grammar rules from this module's doc strings.""" + import re + from types import ModuleType + lines = [] + startline = None + for name, obj in globals().items(): + if hasattr(obj, "__doc__") and not isinstance(obj, ModuleType): + m = re.search(r"Grammar:\s*\$([^$]+)\$", obj.__doc__ or "") + if m: + rule = obj.__name__, " ".join(m.group(1).split()) + if rule[0] == "Program": + assert not startline + startline = rule + else: + lines.append(rule) + lines.sort() + # The start symbol *must* be the first rule + lines.insert(0, startline) + for name, rhs in lines: + stream.write("%s: %s\n" % (name, rhs)) + +if __name__ == '__main__': + import sys + generate_grammar(sys.stdout) Added: sandbox/trunk/2to3/pgen2/conv.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/conv.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,256 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Convert graminit.[ch] spit out by pgen to Python code. + +Pgen is the Python parser generator. It is useful to quickly create a +parser from a grammar file in Python's grammar notation. But I don't +want my parsers to be written in C (yet), so I'm translating the +parsing tables to Python data structures and writing a Python parse +engine. + +Note that the token numbers are constants determined by the standard +Python tokenizer. The standard token module defines these numbers and +their names (the names are not used much). The token numbers are +hardcoded into the Python tokenizer and into pgen. A Python +implementation of the Python tokenizer is also available, in the +standard tokenize module. + +On the other hand, symbol numbers (representing the grammar's +non-terminals) are assigned by pgen based on the actual grammar +input. + +Note: this module is pretty much obsolete; the pgen module generates +equivalent grammar tables directly from the Grammar.txt input file +without having to invoke the Python pgen C program. + +""" + +# Python imports +import re +import token + +from pgen2 import grammar + +class Converter(grammar.Grammar): + """Grammar subclass that reads classic pgen output files. + + The run() method reads the tables as produced by the pgen parser + generator, typically contained in two C files, graminit.h and + graminit.c. The other methods are for internal use only. 
+ + See the base class for more documentation. + + """ + + def run(self, graminit_h, graminit_c): + """Load the grammar tables from the text files written by pgen.""" + self.parse_graminit_h(graminit_h) + self.parse_graminit_c(graminit_c) + self.finish_off() + + def parse_graminit_h(self, filename): + """Parse the .h file writen by pgen. (Internal) + + This file is a sequence of #define statements defining the + nonterminals of the grammar as numbers. We build two tables + mapping the numbers to names and back. + + """ + try: + f = open(filename) + except IOError, err: + print "Can't open %s: %s" % (filename, err) + return False + self.symbol2number = {} + self.number2symbol = {} + lineno = 0 + for line in f: + lineno += 1 + mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) + if not mo and line.strip(): + print "%s(%s): can't parse %s" % (filename, lineno, + line.strip()) + else: + symbol, number = mo.groups() + number = int(number) + assert symbol not in self.symbol2number + assert number not in self.number2symbol + self.symbol2number[symbol] = number + self.number2symbol[number] = symbol + return True + + def parse_graminit_c(self, filename): + """Parse the .c file writen by pgen. (Internal) + + The file looks as follows. The first two lines are always this: + + #include "pgenheaders.h" + #include "grammar.h" + + After that come four blocks: + + 1) one or more state definitions + 2) a table defining dfas + 3) a table defining labels + 4) a struct defining the grammar + + A state definition has the following form: + - one or more arc arrays, each of the form: + static arc arcs_<n>_<m>[<k>] = { + {<i>, <j>}, + ... + }; + - followed by a state array, of the form: + static state states_<s>[<t>] = { + {<k>, arcs_<n>_<m>}, + ... + }; + + """ + try: + f = open(filename) + except IOError, err: + print "Can't open %s: %s" % (filename, err) + return False + # The code below essentially uses f's iterator-ness! 
+ lineno = 0 + + # Expect the two #include lines + lineno, line = lineno+1, f.next() + assert line == '#include "pgenheaders.h"\n', (lineno, line) + lineno, line = lineno+1, f.next() + assert line == '#include "grammar.h"\n', (lineno, line) + + # Parse the state definitions + lineno, line = lineno+1, f.next() + allarcs = {} + states = [] + while line.startswith("static arc "): + while line.startswith("static arc "): + mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", + line) + assert mo, (lineno, line) + n, m, k = map(int, mo.groups()) + arcs = [] + for _ in range(k): + lineno, line = lineno+1, f.next() + mo = re.match(r"\s+{(\d+), (\d+)},$", line) + assert mo, (lineno, line) + i, j = map(int, mo.groups()) + arcs.append((i, j)) + lineno, line = lineno+1, f.next() + assert line == "};\n", (lineno, line) + allarcs[(n, m)] = arcs + lineno, line = lineno+1, f.next() + mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) + assert mo, (lineno, line) + s, t = map(int, mo.groups()) + assert s == len(states), (lineno, line) + state = [] + for _ in range(t): + lineno, line = lineno+1, f.next() + mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) + assert mo, (lineno, line) + k, n, m = map(int, mo.groups()) + arcs = allarcs[n, m] + assert k == len(arcs), (lineno, line) + state.append(arcs) + states.append(state) + lineno, line = lineno+1, f.next() + assert line == "};\n", (lineno, line) + lineno, line = lineno+1, f.next() + self.states = states + + # Parse the dfas + dfas = {} + mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) + assert mo, (lineno, line) + ndfas = int(mo.group(1)) + for i in range(ndfas): + lineno, line = lineno+1, f.next() + mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', + line) + assert mo, (lineno, line) + symbol = mo.group(2) + number, x, y, z = map(int, mo.group(1, 3, 4, 5)) + assert self.symbol2number[symbol] == number, (lineno, line) + assert self.number2symbol[number] == symbol, (lineno, line) + assert x == 0, (lineno, line) + state = states[z] + assert y == len(state), (lineno, line) + lineno, line = lineno+1, f.next() + mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) + assert mo, (lineno, line) + first = {} + rawbitset = eval(mo.group(1)) + for i, c in enumerate(rawbitset): + byte = ord(c) + for j in range(8): + if byte & (1<<j): + first[i*8 + j] = 1 + dfas[number] = (state, first) + lineno, line = lineno+1, f.next() + assert line == "};\n", (lineno, line) + self.dfas = dfas + + # Parse the labels + labels = [] + lineno, line = lineno+1, f.next() + mo = re.match(r"static label labels\[(\d+)\] = {$", line) + assert mo, (lineno, line) + nlabels = int(mo.group(1)) + for i in range(nlabels): + lineno, line = lineno+1, f.next() + mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) + assert mo, (lineno, line) + x, y = mo.groups() + x = int(x) + if y == "0": + y = None + else: + y = eval(y) + labels.append((x, y)) + lineno, line = lineno+1, f.next() + assert line == "};\n", (lineno, line) + self.labels = labels + + # Parse the grammar struct + lineno, line = lineno+1, f.next() + assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) + lineno, line = lineno+1, f.next() + mo = re.match(r"\s+(\d+),$", line) + assert mo, (lineno, line) + ndfas = int(mo.group(1)) + assert ndfas == len(self.dfas) + lineno, line = lineno+1, f.next() + assert line == "\tdfas,\n", (lineno, line) + lineno, line = lineno+1, f.next() + mo = re.match(r"\s+{(\d+), labels},$", line) + assert mo, (lineno, line) + nlabels = int(mo.group(1)) + assert nlabels 
== len(self.labels), (lineno, line) + lineno, line = lineno+1, f.next() + mo = re.match(r"\s+(\d+)$", line) + assert mo, (lineno, line) + start = int(mo.group(1)) + assert start in self.number2symbol, (lineno, line) + self.start = start + lineno, line = lineno+1, f.next() + assert line == "};\n", (lineno, line) + try: + lineno, line = lineno+1, f.next() + except StopIteration: + pass + else: + assert 0, (lineno, line) + + def finish_off(self): + """Create additional useful structures. (Internal).""" + self.keywords = {} # map from keyword strings to arc labels + self.tokens = {} # map from numeric token values to arc labels + for ilabel, (type, value) in enumerate(self.labels): + if type == token.NAME and value is not None: + self.keywords[value] = ilabel + elif value is None: + self.tokens[type] = ilabel Added: sandbox/trunk/2to3/pgen2/driver.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/driver.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,105 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright 2006 Python Software Foundation. All Rights Reserved. + +"""Parser driver. + +This provides a high-level interface to parse a file into a syntax tree. + +""" + +__author__ = "Guido van Rossum <guido@python.org>" + +__all__ = ["Driver", "load_grammar"] + +# Python imports +import os +import token +import logging +import tokenize + +# Pgen imports +from pgen2 import parse +from pgen2 import astnode +from pgen2 import grammar + +class Driver(object): + + def __init__(self, grammar, convert=None, logger=None): + self.grammar = grammar + if logger is None: + logger = logging.getLogger() + self.logger = logger + self.convert = convert + + def parse_stream_raw(self, stream, debug=False): + """Parse a stream and return the concrete syntax tree.""" + p = parse.Parser(self.grammar, self.convert) + p.setup() + t = v = x = None + # (t, v, x, y, z) == (type, value, start, end, line) + for t, v, x, y, z in tokenize.generate_tokens(stream.readline): + if t in (tokenize.COMMENT, tokenize.NL): + continue + if t == token.OP: + t = grammar.opmap[v] + if debug: + self.logger.debug("%s %r", token.tok_name[t], v) + if p.addtoken(t, v, x): + if debug: + self.logger.debug("Stop.") + break + else: + # We never broke out -- EOF is too soon (how can this happen???) 
+ raise parse.ParseError("incomplete input", t, v, x) + return p.rootnode + + def parse_stream(self, stream, debug=False): + """Parse a stream and return the syntax tree.""" + return self.parse_stream_raw(stream, debug) + + def parse_file(self, filename, debug=False): + """Parse a file and return the syntax tree.""" + stream = open(filename) + try: + return self.parse_stream(stream, debug) + finally: + stream.close() + + def parse_string(self, text, debug=False): + """Parse a string and return the syntax tree.""" + from StringIO import StringIO + stream = StringIO(text) + return self.parse_stream(stream, debug) + +def load_grammar(gt="Grammar.txt", gp=None, + save=True, force=False, logger=None): + """Load the grammar (maybe from a pickle).""" + if logger is None: + logger = logging.getLogger() + if gp is None: + head, tail = os.path.splitext(gt) + if tail == ".txt": + tail = "" + gp = head + tail + ".pickle" + if force or not _newer(gp, gt): + logger.info("Generating grammar tables from %s", gt) + from pgen2 import pgen + g = pgen.generate_grammar(gt) + if save: + logger.info("Writing grammar tables to %s", gp) + g.dump(gp) + else: + g = grammar.Grammar() + g.load(gp) + return g + +def _newer(a, b): + """Inquire whether file a was written since file b.""" + if not os.path.exists(a): + return False + if not os.path.exists(b): + return True + return os.path.getmtime(a) >= os.path.getmtime(b) Added: sandbox/trunk/2to3/pgen2/grammar.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/grammar.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,167 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""This module defines the data structures used to represent a grammar. + +These are a bit arcane because they are derived from the data +structures used by Python's 'pgen' parser generator. + +There's also a table here mapping operators to their names in the +token module; the Python tokenize module reports all operators as the +fallback token code OP, but the parser needs the actual token code. + +""" + +# Python imports +import token, tokenize +import cPickle as pickle + +class Grammar(object): + """Pgen parsing tables tables conversion class. + + Once initialized, this class supplies the grammar tables for the + parsing engine implemented by parse.py. The parsing engine + accesses the instance variables directly. The class here does not + provide initialization of the tables; several subclasses exist to + do this (see the conv and pgen modules). + + The load() method reads the tables from a pickle file, which is + much faster than the other ways offered by subclasses. The pickle + file is written by calling dump() (after loading the grammar + tables using a subclass). The report() method prints a readable + representation of the tables to stdout, for debugging. + + The instance variables are as follows: + + symbol2number -- a dict mapping symbol names to numbers. Symbol + numbers are always 256 or higher, to distinguish + them from token numbers, which are between 0 and + 255 (inclusive). + + number2symbol -- a dict mapping numbers to symbol names; + these two are each other's inverse. + + states -- a list of DFAs, where each DFA is a list of + states, each state is is a list of arcs, and each + arc is a (i, j) pair where i is a label and j is + a state number. The DFA number is the index into + this list. (This name is slightly confusing.) 
+ Final states are represented by a special arc of + the form (0, j) where j is its own state number. + + dfas -- a dict mapping symbol numbers to (DFA, first) + pairs, where DFA is an item from the states list + above, and first is a set of tokens that can + begin this grammar rule (represented by a dict + whose values are always 1). + + labels -- a list of (x, y) pairs where x is either a token + number or a symbol number, and y is either None + or a string; the strings are keywords. The label + number is the index in this list; label numbers + are used to mark state transitions (arcs) in the + DFAs. + + start -- the number of the grammar's start symbol. + + keywords -- a dict mapping keyword strings to arc labels. + + tokens -- a dict mapping token numbers to arc labels. + + """ + + def __init__(self): + self.symbol2number = {} + self.number2symbol = {} + self.states = [] + self.dfas = {} + self.labels = [(0, "EMPTY")] + self.keywords = {} + self.tokens = {} + self.symbol2label = {} + self.start = 256 + + def dump(self, filename): + """Dump the grammar tables to a pickle file.""" + f = open(filename, "wb") + pickle.dump(self.__dict__, f, 2) + f.close() + + def load(self, filename): + """Load the grammar tables from a pickle file.""" + f = open(filename, "rb") + d = pickle.load(f) + f.close() + self.__dict__.update(d) + + def report(self): + """Dump the grammar tables to standard output, for debugging.""" + from pprint import pprint + print "s2n" + pprint(self.symbol2number) + print "n2s" + pprint(self.number2symbol) + print "states" + pprint(self.states) + print "dfas" + pprint(self.dfas) + print "labels" + pprint(self.labels) + print "start", self.start + + +# Map from operator to number (since tokenize doesn't do this) + +opmap_raw = """ +( LPAR +) RPAR +[ LSQB +] RSQB +: COLON +, COMMA +; SEMI ++ PLUS +- MINUS +* STAR +/ SLASH +| VBAR +& AMPER +< LESS +> GREATER += EQUAL +. DOT +% PERCENT +` BACKQUOTE +{ LBRACE +} RBRACE +@ AT +== EQEQUAL +!= NOTEQUAL +<> NOTEQUAL +<= LESSEQUAL +>= GREATEREQUAL +~ TILDE +^ CIRCUMFLEX +<< LEFTSHIFT +>> RIGHTSHIFT +** DOUBLESTAR ++= PLUSEQUAL +-= MINEQUAL +*= STAREQUAL +/= SLASHEQUAL +%= PERCENTEQUAL +&= AMPEREQUAL +|= VBAREQUAL +^= CIRCUMFLEXEQUAL +<<= LEFTSHIFTEQUAL +>>= RIGHTSHIFTEQUAL +**= DOUBLESTAREQUAL +// DOUBLESLASH +//= DOUBLESLASHEQUAL +""" + +opmap = {} +for line in opmap_raw.splitlines(): + if line: + op, name = line.split() + opmap[op] = getattr(token, name) Added: sandbox/trunk/2to3/pgen2/literals.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/literals.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,60 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. 
+ +"""Safely evaluate Python string literals without using eval().""" + +import re + +simple_escapes = {"a": "\a", + "b": "\b", + "f": "\f", + "n": "\n", + "r": "\r", + "t": "\t", + "v": "\v", + "'": "'", + '"': '"', + "\\": "\\"} + +def escape(m): + all, tail = m.group(0, 1) + assert all.startswith("\\") + esc = simple_escapes.get(tail) + if esc is not None: + return esc + if tail.startswith("x"): + hexes = tail[1:] + if len(hexes) < 2: + raise ValueError("invalid hex string escape ('\\%s')" % tail) + try: + i = int(hexes, 16) + except ValueError: + raise ValueError("invalid hex string escape ('\\%s')" % tail) + else: + try: + i = int(tail, 8) + except ValueError: + raise ValueError("invalid octal string escape ('\\%s')" % tail) + return chr(i) + +def evalString(s): + assert s.startswith("'") or s.startswith('"'), repr(s[:1]) + q = s[0] + if s[:3] == q*3: + q = q*3 + assert s.endswith(q), repr(s[-len(q):]) + assert len(s) >= 2*len(q) + s = s[len(q):-len(q)] + return re.sub(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3})", escape, s) + +def test(): + for i in range(256): + c = chr(i) + s = repr(c) + e = evalString(s) + if e != c: + print i, c, s, e + + +if __name__ == "__main__": + test() Added: sandbox/trunk/2to3/pgen2/parse.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/parse.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,199 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Parser engine for the grammar tables generated by pgen. + +The grammar table must be loaded first. + +See Parser/parser.c in the Python distribution for additional info on +how this parsing engine works. + +""" + +# Python imports +import token + +class ParseError(Exception): + """Exception to signal the parser is stuck.""" + + def __init__(self, msg, type, value, context): + Exception.__init__(self, "%s: type=%r, value=%r, context=%r" % + (msg, type, value, context)) + self.msg = msg + self.type = type + self.value = value + self.context = context + +class Parser(object): + """Parser engine. + + The proper usage sequence is: + + p = Parser(grammar, [converter]) # create instance + p.setup([start]) # prepare for parsing + <for each input token>: + if p.addtoken(...): # parse a token; may raise ParseError + break + root = p.rootnode # root of abstract syntax tree + + A Parser instance may be reused by calling setup() repeatedly. + + A Parser instance contains state pertaining to the current token + sequence, and should not be used concurrently by different threads + to parse separate token sequences. + + See driver.py for how to get input tokens by tokenizing a file or + string. + + Parsing is complete when addtoken() returns True; the root of the + abstract syntax tree can then be retrieved from the rootnode + instance variable. When a syntax error occurs, addtoken() raises + the ParseError exception. There is no error recovery; the parser + cannot be used after a syntax error was reported (but it can be + reinitialized by calling setup()). + + """ + + def __init__(self, grammar, convert=None): + """Constructor. + + The grammar argument is a grammar.Grammar instance; see the + grammar module for more information. + + The parser is not ready yet for parsing; you must call the + setup() method to get it started. + + The optional convert argument is a function mapping concrete + syntax tree nodes to abstract syntax tree nodes. 
If not + given, no conversion is done and the syntax tree produced is + the concrete syntax tree. If given, it must be a function of + two arguments, the first being the grammar (a grammar.Grammar + instance), and the second being the concrete syntax tree node + to be converted. The syntax tree is converted from the bottom + up. + + A concrete syntax tree node is a (type, value, context, nodes) + tuple, where type is the node type (a token or symbol number), + value is None for symbols and a string for tokens, context is + None or an opaque value used for error reporting (typically a + (lineno, offset) pair), and nodes is a list of children for + symbols, and None for tokens. + + An abstract syntax tree node may be anything; this is entirely + up to the converter function. For example, it can be an + instance of a subclass of the astnode.Node class (see the + astnode module). + + """ + self.grammar = grammar + self.convert = convert or (lambda grammar, node: node) + + def setup(self, start=None): + """Prepare for parsing. + + This *must* be called before starting to parse. + + The optional argument is an alternative start symbol; it + defaults to the grammar's start symbol. + + You can use a Parser instance to parse any number of programs; + each time you call setup() the parser is reset to an initial + state determined by the (implicit or explicit) start symbol. + + """ + if start is None: + start = self.grammar.start + # Each stack entry is a tuple: (dfa, state, node). + # A node is a tuple: (type, value, context, children), + # where children is a list of nodes or None, and context may be None. + newnode = (start, None, None, []) + stackentry = (self.grammar.dfas[start], 0, newnode) + self.stack = [stackentry] + self.rootnode = None + + def addtoken(self, type, value, context): + """Add a token; return True iff this is the end of the program.""" + # Map from token to label + ilabel = self.classify(type, value, context) + # Loop until the token is shifted; may raise exceptions + while True: + dfa, state, node = self.stack[-1] + states, first = dfa + arcs = states[state] + # Look for a state with this label + for i, newstate in arcs: + t, v = self.grammar.labels[i] + if ilabel == i: + # Look it up in the list of labels + assert t < 256 + # Shift a token; we're done with it + self.shift(type, value, newstate, context) + # Pop while we are in an accept-only state + state = newstate + while states[state] == [(0, state)]: + self.pop() + if not self.stack: + # Done parsing! + return True + dfa, state, node = self.stack[-1] + states, first = dfa + # Done with this token + return False + elif t >= 256: + # See if it's a symbol and if we're in its first set + itsdfa = self.grammar.dfas[t] + itsstates, itsfirst = itsdfa + if ilabel in itsfirst: + # Push a symbol + self.push(t, self.grammar.dfas[t], newstate, context) + break # To continue the outer while loop + else: + if (0, state) in arcs: + # An accepting state, pop it and try something else + self.pop() + if not self.stack: + # Done parsing, but another token is input + raise ParseError("too much input", + type, value, context) + else: + # No success finding a transition + raise ParseError("bad input", type, value, context) + + def classify(self, type, value, context): + """Turn a token into a label. 
(Internal)""" + if type == token.NAME: + # Check for reserved words + ilabel = self.grammar.keywords.get(value) + if ilabel is not None: + return ilabel + ilabel = self.grammar.tokens.get(type) + if ilabel is None: + raise ParseError("bad token", type, value, context) + return ilabel + + def shift(self, type, value, newstate, context): + """Shift a token. (Internal)""" + dfa, state, node = self.stack[-1] + newnode = (type, value, context, None) + newnode = self.convert(self.grammar, newnode) + if newnode is not None: + node[-1].append(newnode) + self.stack[-1] = (dfa, newstate, node) + + def push(self, type, newdfa, newstate, context): + """Push a nonterminal. (Internal)""" + dfa, state, node = self.stack[-1] + newnode = (type, None, context, []) + self.stack[-1] = (dfa, newstate, node) + self.stack.append((newdfa, 0, newnode)) + + def pop(self): + """Pop a nonterminal. (Internal)""" + popdfa, popstate, popnode = self.stack.pop() + newnode = self.convert(self.grammar, popnode) + if newnode is not None: + if self.stack: + dfa, state, node = self.stack[-1] + node[-1].append(newnode) + else: + self.rootnode = newnode Added: sandbox/trunk/2to3/pgen2/pgen.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/pgen.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,384 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Python imports +import token +import tokenize + +# Pgen imports +from pgen2 import grammar + +class PgenGrammar(grammar.Grammar): + pass + +class ParserGenerator(object): + + def __init__(self, filename, stream=None): + if stream is None: + stream = open(filename) + self.filename = filename + self.stream = stream + self.generator = tokenize.generate_tokens(stream.readline) + self.gettoken() # Initialize lookahead + self.dfas, self.startsymbol = self.parse() + self.first = {} # map from symbol name to set of tokens + self.addfirstsets() + + def make_grammar(self): + c = PgenGrammar() + names = self.dfas.keys() + names.sort() + names.remove(self.startsymbol) + names.insert(0, self.startsymbol) + for name in names: + i = 256 + len(c.symbol2number) + c.symbol2number[name] = i + c.number2symbol[i] = name + for name in names: + dfa = self.dfas[name] + states = [] + for state in dfa: + arcs = [] + for label, next in state.arcs.iteritems(): + arcs.append((self.make_label(c, label), dfa.index(next))) + if state.isfinal: + arcs.append((0, dfa.index(state))) + states.append(arcs) + c.states.append(states) + c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name)) + c.start = c.symbol2number[self.startsymbol] + return c + + def make_first(self, c, name): + rawfirst = self.first[name] + first = {} + for label in rawfirst: + ilabel = self.make_label(c, label) + ##assert ilabel not in first # XXX failed on <> ... != + first[ilabel] = 1 + return first + + def make_label(self, c, label): + # XXX Maybe this should be a method on a subclass of converter? 
+ ilabel = len(c.labels) + if label[0].isalpha(): + # Either a symbol name or a named token + if label in c.symbol2number: + # A symbol name (a non-terminal) + if label in c.symbol2label: + return c.symbol2label[label] + else: + c.labels.append((c.symbol2number[label], None)) + c.symbol2label[label] = ilabel + return ilabel + else: + # A named token (NAME, NUMBER, STRING) + itoken = getattr(token, label, None) + assert isinstance(itoken, int), label + assert itoken in token.tok_name, label + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + else: + # Either a keyword or an operator + assert label[0] in ('"', "'"), label + value = eval(label) + if value[0].isalpha(): + # A keyword + if value in c.keywords: + return c.keywords[value] + else: + c.labels.append((token.NAME, value)) + c.keywords[value] = ilabel + return ilabel + else: + # An operator (any non-numeric token) + itoken = grammar.opmap[value] # Fails if unknown token + if itoken in c.tokens: + return c.tokens[itoken] + else: + c.labels.append((itoken, None)) + c.tokens[itoken] = ilabel + return ilabel + + def addfirstsets(self): + names = self.dfas.keys() + names.sort() + for name in names: + if name not in self.first: + self.calcfirst(name) + #print name, self.first[name].keys() + + def calcfirst(self, name): + dfa = self.dfas[name] + self.first[name] = None # dummy to detect left recursion + state = dfa[0] + totalset = {} + overlapcheck = {} + for label, next in state.arcs.iteritems(): + if label in self.dfas: + if label in self.first: + fset = self.first[label] + if fset is None: + raise ValueError("recursion for rule %r" % name) + else: + self.calcfirst(label) + fset = self.first[label] + totalset.update(fset) + overlapcheck[label] = fset + else: + totalset[label] = 1 + overlapcheck[label] = {label: 1} + inverse = {} + for label, itsfirst in overlapcheck.iteritems(): + for symbol in itsfirst: + if symbol in inverse: + raise ValueError("rule %s is ambiguous; %s is in the" + " first sets of %s as well as %s" % + (name, symbol, label, inverse[symbol])) + inverse[symbol] = label + self.first[name] = totalset + + def parse(self): + dfas = {} + startsymbol = None + # MSTART: (NEWLINE | RULE)* ENDMARKER + while self.type != token.ENDMARKER: + while self.type == token.NEWLINE: + self.gettoken() + # RULE: NAME ':' RHS NEWLINE + name = self.expect(token.NAME) + self.expect(token.OP, ":") + a, z = self.parse_rhs() + self.expect(token.NEWLINE) + #self.dump_nfa(name, a, z) + dfa = self.make_dfa(a, z) + #self.dump_dfa(name, dfa) + oldlen = len(dfa) + self.simplify_dfa(dfa) + newlen = len(dfa) + dfas[name] = dfa + #print name, oldlen, newlen + if startsymbol is None: + startsymbol = name + return dfas, startsymbol + + def make_dfa(self, start, finish): + # To turn an NFA into a DFA, we define the states of the DFA + # to correspond to *sets* of states of the NFA. Then do some + # state reduction. Let's represent sets as dicts with 1 for + # values. 
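        # (Worked example of the idea: for a rule like  a: 'x' ['y']  the
        # NFA has an epsilon arc bypassing the optional 'y', so the closure
        # reached after shifting 'x' contains both the "before 'y'" state
        # and the final state; that whole set becomes a single DFA state,
        # which is final and carries one 'y' arc to another final DFA state.)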
+ assert isinstance(start, NFAState) + assert isinstance(finish, NFAState) + def closure(state): + base = {} + addclosure(state, base) + return base + def addclosure(state, base): + assert isinstance(state, NFAState) + if state in base: + return + base[state] = 1 + for label, next in state.arcs: + if label is None: + addclosure(next, base) + states = [DFAState(closure(start), finish)] + for state in states: # NB states grows while we're iterating + arcs = {} + for nfastate in state.nfaset: + for label, next in nfastate.arcs: + if label is not None: + addclosure(next, arcs.setdefault(label, {})) + for label, nfaset in arcs.iteritems(): + for st in states: + if st.nfaset == nfaset: + break + else: + st = DFAState(nfaset, finish) + states.append(st) + state.addarc(st, label) + return states # List of DFAState instances; first one is start + + def dump_nfa(self, name, start, finish): + print "Dump of NFA for", name + todo = [start] + for i, state in enumerate(todo): + print " State", i, state is finish and "(final)" or "" + for label, next in state.arcs: + if next in todo: + j = todo.index(next) + else: + j = len(todo) + todo.append(next) + if label is None: + print " -> %d" % j + else: + print " %s -> %d" % (label, j) + + def dump_dfa(self, name, dfa): + print "Dump of DFA for", name + for i, state in enumerate(dfa): + print " State", i, state.isfinal and "(final)" or "" + for label, next in state.arcs.iteritems(): + print " %s -> %d" % (label, dfa.index(next)) + + def simplify_dfa(self, dfa): + # This is not theoretically optimal, but works well enough. + # Algorithm: repeatedly look for two states that have the same + # set of arcs (same labels pointing to the same nodes) and + # unify them, until things stop changing. + + # dfa is a list of DFAState instances + changes = True + while changes: + changes = False + for i, state_i in enumerate(dfa): + for j in range(i+1, len(dfa)): + state_j = dfa[j] + if state_i == state_j: + #print " unify", i, j + del dfa[j] + for state in dfa: + state.unifystate(state_j, state_i) + changes = True + break + + def parse_rhs(self): + # RHS: ALT ('|' ALT)* + a, z = self.parse_alt() + if self.value != "|": + return a, z + else: + aa = NFAState() + zz = NFAState() + aa.addarc(a) + z.addarc(zz) + while self.value == "|": + self.gettoken() + a, z = self.parse_alt() + aa.addarc(a) + z.addarc(zz) + return aa, zz + + def parse_alt(self): + # ALT: ITEM+ + a, b = self.parse_item() + while (self.value in ("(", "[") or + self.type in (token.NAME, token.STRING)): + c, d = self.parse_item() + b.addarc(c) + b = d + return a, b + + def parse_item(self): + # ITEM: '[' RHS ']' | ATOM ['+' | '*'] + if self.value == "[": + self.gettoken() + a, z = self.parse_rhs() + self.expect(token.OP, "]") + a.addarc(z) + return a, z + else: + a, z = self.parse_atom() + value = self.value + if value not in ("+", "*"): + return a, z + self.gettoken() + z.addarc(a) + if value == "+": + return a, z + else: + return a, a + + def parse_atom(self): + # ATOM: '(' RHS ')' | NAME | STRING + if self.value == "(": + self.gettoken() + a, z = self.parse_rhs() + self.expect(token.OP, ")") + return a, z + elif self.type in (token.NAME, token.STRING): + a = NFAState() + z = NFAState() + a.addarc(z, self.value) + self.gettoken() + return a, z + else: + self.raise_error("expected (...) 
or NAME or STRING, got %s/%s", + self.type, self.value) + + def expect(self, type, value=None): + if self.type != type or (value is not None and self.value != value): + self.raise_error("expected %s/%s, got %s/%s", + type, value, self.type, self.value) + value = self.value + self.gettoken() + return value + + def gettoken(self): + tup = self.generator.next() + while tup[0] in (tokenize.COMMENT, tokenize.NL): + tup = self.generator.next() + self.type, self.value, self.begin, self.end, self.line = tup + #print token.tok_name[self.type], repr(self.value) + + def raise_error(self, msg, *args): + if args: + try: + msg = msg % args + except: + msg = " ".join([msg] + map(str, args)) + raise SyntaxError(msg, (self.filename, self.end[0], + self.end[1], self.line)) + +class NFAState(object): + + def __init__(self): + self.arcs = [] # list of (label, NFAState) pairs + + def addarc(self, next, label=None): + assert label is None or isinstance(label, str) + assert isinstance(next, NFAState) + self.arcs.append((label, next)) + +class DFAState(object): + + def __init__(self, nfaset, final): + assert isinstance(nfaset, dict) + assert isinstance(iter(nfaset).next(), NFAState) + assert isinstance(final, NFAState) + self.nfaset = nfaset + self.isfinal = final in nfaset + self.arcs = {} # map from label to DFAState + + def addarc(self, next, label): + assert isinstance(label, str) + assert label not in self.arcs + assert isinstance(next, DFAState) + self.arcs[label] = next + + def unifystate(self, old, new): + for label, next in self.arcs.iteritems(): + if next is old: + self.arcs[label] = new + + def __eq__(self, other): + # Equality test -- ignore the nfaset instance variable + assert isinstance(other, DFAState) + if self.isfinal != other.isfinal: + return False + # Can't just return self.arcs == other.arcs, because that + # would invoke this method recursively, with cycles... + if len(self.arcs) != len(other.arcs): + return False + for label, next in self.arcs.iteritems(): + if next is not other.arcs.get(label): + return False + return True + +def generate_grammar(filename="Grammar.txt"): + p = ParserGenerator(filename) + return p.make_grammar() Added: sandbox/trunk/2to3/pgen2/python.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/python.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,9 @@ +#!/usr/bin/python2.5 +# Copyright 2006 Python Software Foundation. All rights reserved. + +"""Full Python grammar""" + +from __future__ import with_statement + +__author__ = "Guido van Rossum <guido@python.org>" + Added: sandbox/trunk/2to3/pgen2/test.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pgen2/test.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,18 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +def test(): + import sys + sys.path[0] = ".." + from pgen2 import astnode, driver + f = open("Grammar.txt", "w") + try: + astnode.generate_grammar(f) + finally: + f.close() + sample = "year<=1989 ? ('Modula-3' + ABC) ** 2 : Python" + tree = driver.parse_string(sample, True) + print tree + +if __name__ == "__main__": + test() Added: sandbox/trunk/2to3/play.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/play.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,66 @@ +#!/usr/bin/env python2.5 +# Copyright 2006 Python Software Foundation. 
All Rights Reserved. + +"""XXX.""" + +from __future__ import with_statement + +__author__ = "Guido van Rossum <guido@python.org>" + +# Python imports +import os +import sys +import logging + +import pgen2 +from pgen2 import driver + +import pynode + +logging.basicConfig(level=logging.INFO) + +SAMPLE = """\ +for i in range(10): + print i +class C(object, list): + def f(a, b, c=1, *args, **kwds): + pass +""" + +def main(): + gr = driver.load_grammar("Grammar.txt") + dr = driver.Driver(gr, convert=pynode.convert) + +## tree = dr.parse_file("pynode.py", debug=True) +## print tree + +## for name in sys.modules: +## mod = sys.modules[name] +## if mod is None or not hasattr(mod, "__file__"): +## continue +## fn = mod.__file__ +## if fn.endswith(".pyc"): +## fn = fn[:-1] +## if not fn.endswith(".py"): +## continue +## print >>sys.stderr, "Parsing", fn +## dr.parse_file(fn, debug=True) + + for dir in reversed(sys.path): + try: + names = os.listdir(dir) + except os.error: + continue + print >>sys.stderr, "Scanning", dir, "..." + for name in names: + if not name.endswith(".py"): + continue + print >>sys.stderr, "Parsing", name + try: + dr.parse_file(os.path.join(dir, name), debug=True) + except pgen2.parse.ParseError, err: + print "ParseError:", err + + +if __name__ == "__main__": + main() Added: sandbox/trunk/2to3/pynode.py ============================================================================== --- (empty file) +++ sandbox/trunk/2to3/pynode.py Wed Nov 29 18:38:40 2006 @@ -0,0 +1,698 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +# Modifications: +# Copyright 2006 Python Software Foundation. All Rights Reserved. + +"""Syntax tree node definitions. + +There is a class or function corresponding to each terminal and +nonterminal symbol. + +The grammar is pieced together from the docstrings of the +corresponding classes and functions: text between dollar signs is +assumed to be the right-hand side of the grammar rule corresponding to +that class or function. + +We use __slots__ to make the parse tree nodes as small as possible. + +NOTE: EVERY CLASS MUST HAVE A __slots__ DEFINITION, EVEN IF EMPTY! +(If not, the instances will get a __dict__, defeating the purpose of +__slots__ in our case.) +""" + +__author__ = "Guido van Rossum <guido@python.org>" + +# Python imports +import token +import logging + +# Pgen imports +from pgen2 import grammar + +# Custom logger +logger = logging.getLogger() + +class Node(object): + """Abstract base class for all nodes. + + This has no attributes except a context slot which holds the line + number (or more detailed context info). In the future this might + change this to several slots (e.g. filename, lineno, column, or + even filename, start_lineno, start_column, end_lineno, + end_column). The context is only referenced by two places: the + part of the code that sticks it in, and the part of the code that + reports errors. + + In order to reduce the amount of boilerplate code, the context is + argument is handled by __new__ rather than __init__. There are + also a few subclasses that override __new__ to sometimes avoid + constructing an instance. 
+ + """ + + __slots__ = ["context"] + + def __new__(cls, context, *rest): + assert cls not in (Node, Nonterminal, Terminal, Constant) + obj = object.__new__(cls) + obj.context = context + return obj + + _stretch = False # Set to true to stretch the repr() vertically + + def __repr__(self, repr=repr): + stretch = self._stretch + r = [self.__class__.__name__] + if stretch: + r.append("(\n ") + else: + r.append("(") # ")" -- crutch for Emacs python-mode :-( + cls = self.__class__ + # Get nearest non-empty slots __slots__. This assumes + # *something* has non-empty __slots__ before we reach object + # (which has no __slots__). The class hierarchy guarantees + # this. + slots = cls.__slots__ + while not slots: + cls = cls.__base__ + slots = cls.__slots__ + first = True + for name in slots: + if name == "context": + continue # Skip this + if first: + first = False + else: + if stretch: + r.append(",\n ") + else: + r.append(", ") + try: + value = getattr(self, name) + except AttributeError: + continue + if stretch and isinstance(value, list): + rr = map(repr, value) + rv = "[" + ",\n ".join(rr) + "]" + else: + rv = repr(value) + if stretch: + rv = rv.replace("\n", "\n ") + r.append(rv) + r.append(")") + return "".join(r) + + def __str__(self): + return self.__repr__(repr=str) + +class Nonterminal(Node): + """Abstract base class for nonterminal symbols. + + Nothing beyond Node. + + """ + + __slots__ = [] + + _stretch = True + +class Terminal(Node): + """Abstract base class for terminal symbols. + + Nothing beyond Node. + + """ + + __slots__ = [] + +class Series(Nonterminal): + """Abstract base class for nonterminals like stmts: stmt+.""" + + __slots__ = [] + + def __new__(cls, context, *nodes): + assert cls is not Series + if len(nodes) == 0: + return None + elif len(nodes) == 1: + return nodes[0] + else: + obj = Nonterminal.__new__(cls, context) + obj.initseries(nodes) + return obj + +class Constant(Terminal): + """Abstract base class for constants (e.g. number or string). + + Attributes: + + repr -- a string giving the token, exactly as read from source + + """ + + __slots__ = ["repr"] + + def __init__(self, context, repr): + self.repr = repr + + def __str__(self): + return self.repr + +# Node classes for terminal symbols + +class Name(Terminal): + """Name (e.g. a variable name or an attribute name). + + Attributes: + + name -- a string giving the name. + + """ + + __slots__ = ["name"] + + def __init__(self, context, name): + self.name = name + + def __str__(self): + return self.name + +class Number(Constant): + """Numeric constant. + + Nothing beyond Constant. + + """ + + __slots__ = [] + +class String(Constant): + """String constant. + + Nothing beyond Constant. 
+ + """ + + __slots__ = [] + +# Nodes and factory functions for Python grammar + +class VanishingNonterminal(Nonterminal): + __slots__ = [] + def __new__(cls, context, node): + return node + +class GenericSeries(Series): + __slots__ = ["nodes"] + def initseries(self, nodes): + self.nodes = nodes + +class atom(GenericSeries): + __slots__ = [] + +class power(GenericSeries): + __slots__ = [] + +class factor(GenericSeries): + __slots__ = [] + +class term(GenericSeries): + __slots__ = [] + +class arith_expr(GenericSeries): + __slots__ = [] + +class shift_expr(GenericSeries): + __slots__ = [] + +class and_expr(GenericSeries): + __slots__ = [] + +class xor_expr(GenericSeries): + __slots__ = [] + +class or_expr(GenericSeries): + __slots__ = [] + +class expr(GenericSeries): + __slots__ = [] + +class comparison(GenericSeries): + __slots__ = [] + +class not_test(GenericSeries): + __slots__ = [] + +class and_test(GenericSeries): + __slots__ = [] + +class or_test(GenericSeries): + __slots__ = [] + +class test(GenericSeries): + __slots__ = [] + +class testlist(GenericSeries): + __slots__ = [] + +class expr_stmt(GenericSeries): + __slots__ = [] + +class trailer(GenericSeries): + __slots__ = [] + +class argument(GenericSeries): + __slots__ = [] + +class arglist(GenericSeries): + __slots__ = [] + +class subscript(GenericSeries): + __slots__ = [] + +class subscriptlist(GenericSeries): + __slots__ = [] + +class listmaker(GenericSeries): + __slots__ = [] + +class testlist_gexp(GenericSeries): + __slots__ = [] + +class suite(GenericSeries): + __slots__ = [] + +class if_stmt(GenericSeries): + __slots__ = [] + +class compound_stmt(GenericSeries): + __slots__ = [] + +class parameters(GenericSeries): + __slots__ = [] + +class funcdef(GenericSeries): + __slots__ = [] + +class fpdef(GenericSeries): + __slots__ = [] + +class varargslist(GenericSeries): + __slots__ = [] + +class classdef(GenericSeries): + __slots__ = [] + +class exprlist(GenericSeries): + __slots__ = [] + +class print_stmt(GenericSeries): + __slots__ = [] + +class for_stmt(GenericSeries): + __slots__ = [] + +class dotted_name(GenericSeries): + __slots__ = [] + +class dotted_as_name(GenericSeries): + __slots__ = [] + +class dotted_as_names(GenericSeries): + __slots__ = [] + +class import_as_names(GenericSeries): + __slots__ = [] + +class import_as_name(GenericSeries): + __slots__ = [] + +class import_name(GenericSeries): + __slots__ = [] + +class import_from(GenericSeries): + __slots__ = [] + +class import_stmt(GenericSeries): + __slots__ = [] + +class comp_op(GenericSeries): + __slots__ = [] + +class assert_stmt(GenericSeries): + __slots__ = [] + +class return_stmt(GenericSeries): + __slots__ = [] + +class continue_stmt(GenericSeries): + __slots__ = [] + +class break_stmt(GenericSeries): + __slots__ = [] + +class flow_stmt(GenericSeries): + __slots__ = [] + +class while_stmt(GenericSeries): + __slots__ = [] + +class except_clause(GenericSeries): + __slots__ = [] + +class try_stmt(GenericSeries): + __slots__ = [] + +class dictmaker(GenericSeries): + __slots__ = [] + +class raise_stmt(GenericSeries): + __slots__ = [] + +class del_stmt(GenericSeries): + __slots__ = [] + +class exec_stmt(GenericSeries): + __slots__ = [] + +class augassign(GenericSeries): + __slots__ = [] + +class global_stmt(GenericSeries): + __slots__ = [] + +class fplist(GenericSeries): + __slots__ = [] + +class lambdef(GenericSeries): + __slots__ = [] + +class old_test(GenericSeries): + __slots__ = [] + +class testlist_safe(GenericSeries): + __slots__ = [] + +class 
list_for(GenericSeries): + __slots__ = [] + +class decorator(GenericSeries): + __slots__ = [] + +class decorators(GenericSeries): + __slots__ = [] + +class yield_expr(GenericSeries): + __slots__ = [] + +class yield_stmt(GenericSeries): + __slots__ = [] + +class list_if(GenericSeries): + __slots__ = [] + +class list_iter(GenericSeries): + __slots__ = [] + +class gen_for(GenericSeries): + __slots__ = [] + +class gen_iter(GenericSeries): + __slots__ = [] + +class gen_if(GenericSeries): + __slots__ = [] + +class with_var(GenericSeries): + __slots__ = [] + +class with_stmt(GenericSeries): + __slots__ = [] + +class sliceop(GenericSeries): + __slots__ = [] + +class testlist1(GenericSeries): + __slots__ = [] + + +def _transparent(context, node, *rest): + assert rest == (), (context, node, rest) + return node + +pass_stmt = _transparent +small_stmt = _transparent +stmt = _transparent + +class simple_stmt(GenericSeries): + __slots__ = [] + +class file_input(GenericSeries): + __slots__ = [] + + +# Example nodes and factory functions for nonterminal symbols +# XXX: get rid of these + +def Program(context, stmts): + """Program is a nonterminal with only one non-trivial child. + + Grammar: $ Expression ENDMARKER $ + + """ + return stmts + +class Expression(Series): + "Grammar: $ BinaryExpression ['?' BinaryExpression ':' BinaryExpression] $" + + __slots__ = ["test", "left", "right"] + + def initseries(self, nodes): + self.test, self.left, self.right = nodes + + def __str__(self): + return "%s ? %s : %s" % (self.test, self.left, self.right) + +class BinaryExpression(Series): + "Grammar: $ Operand (Operator Operand)* $" + + __slots__ = ["left", "op", "right"] + + def initseries(self, stuff): + # Stuff is a list with alternating operands and operators + if len(stuff) == 3: + self.left, self.op, self.right = stuff + return + assert len(stuff) > 1 and len(stuff) % 2 == 1 + # Find the rightmost lowest-priority operator + lowest_i = 1 + lowest_op = stuff[lowest_i] + lowest_pri = lowest_op.priority() + for i in range(3, len(stuff), 2): + op = stuff[i] + pri = op.priority() + if pri <= lowest_pri: + lowest_i = i + lowest_op = op + lowest_pri = pri + self.left = self.__class__(self.context, *stuff[:lowest_i]) + self.op = lowest_op + self.right = self.__class__(self.context, *stuff[lowest_i+1:]) + + def __str__(self): + return "(%s %s %s)" % (self.left, self.op, self.right) + +def Operand(context, arg): + """Operand is a nonterminal with one child. + + Grammar: $ Atom | UnaryExpression $ + + """ + return arg + +class UnaryExpression(Nonterminal): + "Grammar: $ UnaryOperator Operand $" + + __slots__ = ["op", "arg"] + + def __init__(self, context, op, arg): + self.op = op + self.arg = arg + + def __str__(self): + return "%s%s" % (self.op, self.arg) + +def UnaryOperator(context, op): + "Grammar: $ 'not' | '-' $" + return op + +class Operator(Nonterminal): + """Operator. + + This has a repr slot and a priority() method. + + The __new__ method implements a sort-of singleton pattern: there's + only one instance per repr value. (Yes, this means the context is + not useful. Therefore we set it to None.) 
+ + Grammar: $ '**' | '*' | '/' | '+' | '-' | '&' | '^' | '|' | + ['not'] 'in' | '==' | '<' | '>' | '!=' | '<=' | '>=' | + 'and' | 'or' $ + + """ + + __slots__ = ["repr"] + + _stretch = False + + _cache = {} + + def __new__(cls, context, *args): + repr = " ".join(args) + # For "not in", the argument should be the string "not in" + obj = cls._cache.get(repr) + if obj is None: + obj = Terminal.__new__(cls, None) + obj.repr = repr + return obj + + def __str__(self): + return self.repr + + def priority(self): + return self._priorities[self.repr] + + _priorities = { + "or": 0, + "and": 1, + "in": 2, + "not in": 2, + "<": 2, + ">": 2, + "==": 2, + "<=": 2, + ">=": 2, + "!=": 2, + "|": 3, + "^": 4, + "&": 5, + "+": 6, + "-": 6, + "*": 7, + "/": 7, + "**": 8, + } + +def Atom(context, arg): + """ Grammar: $ NAME | STRING | NUMBER | ParenthesizedExpression $ + """ + return arg + +def ParenthesizedExpression(context, expr): + "Grammar: $ '(' Expression ')' $" + return expr + + +# Conversion from concrete to abstract syntax trees + +def vanish(context, value): + return None + +token_mapping = { + # Tokens that become parse tree nodes + # (token.NAME is special-cased in the code) + token.NUMBER: Number, + token.STRING: String, + +## # Tokens that vanish +## token.DOT: vanish, +## token.LPAR: vanish, +## token.RPAR: vanish, +## token.COLON: vanish, +## token.COMMA: vanish, +## token.EQUAL: vanish, +## token.DEDENT: vanish, +## token.INDENT: vanish, +## token.LBRACE: vanish, +## token.RBRACE: vanish, +## token.NEWLINE: vanish, + token.ENDMARKER: vanish, +## grammar.QUESTIONMARK: vanish, + + # All other tokens return the token's string value (e.g. "+") + } + +vanishing_keywords = { + # E.g. "def": True, + } + +def convert(grammar, node): + type, value, context, children = node + # Is it a non-terminal symbol? + if type in grammar.number2symbol: + symbol = grammar.number2symbol[type] + factory = globals().get(symbol) + if factory is None: + raise RuntimeError("can't find factory for %s (line %s)" % + (symbol, context)) + # Debug variation: + try: + return factory(context, *children) + except: + logger.debug("%s %s", factory.__name__, "(") + for child in children: + logger.debug("%s %s", "==>", child) + logger.debug(")") + logger.debug("# Did you remember to declare a 'context' arg?") + raise + return factory(context, *children) + + # Must be a terminal symbol. + if type == token.NAME: + # Name or keyword. Special-case the snot out of this. + if value in grammar.keywords: + # Keywords become strings, like operators + if value in vanishing_keywords: + return None + else: + return value + else: + return Name(context, value) + + # Look for a handler in the token_mapping table. 
+ assert type in token.tok_name + factory = token_mapping.get(type) + if factory: + return factory(context, value) + else: + return value + + +# Support code + +def generate_grammar(stream): + """Extract the grammar rules from this module's doc strings.""" + import re + from types import ModuleType + lines = [] + startline = None + for name, obj in globals().items(): + if hasattr(obj, "__doc__") and not isinstance(obj, ModuleType): + m = re.search(r"Grammar:\s*\$([^$]+)\$", obj.__doc__ or "") + if m: + rule = obj.__name__, " ".join(m.group(1).split()) + if rule[0] == "Program": + assert not startline + startline = rule + else: + lines.append(rule) + lines.sort() + # The start symbol *must* be the first rule + lines.insert(0, startline) + for name, rhs in lines: + stream.write("%s: %s\n" % (name, rhs)) + +if __name__ == '__main__': + import sys + generate_grammar(sys.stdout)
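The operator-priority logic in BinaryExpression.initseries above is easiest to see in isolation: it walks an alternating operand/operator sequence and splits it at the rightmost operator with the lowest priority, so that operator becomes the root of the subtree. The following minimal sketch reproduces just that idea; the PRIORITIES table and split() helper are illustrative stand-ins and are not part of pynode.py.

PRIORITIES = {"or": 0, "and": 1, "==": 2, "+": 6, "-": 6, "*": 7, "/": 7, "**": 8}

def split(stuff):
    """Turn ['a', '+', 'b', '*', 'c', '==', 'd'] into nested (left, op, right) tuples."""
    if len(stuff) == 1:
        return stuff[0]
    assert len(stuff) % 2 == 1
    # Operators sit at the odd positions; pick the rightmost one with the
    # lowest priority, as BinaryExpression.initseries does.
    lowest_i = 1
    for i in range(3, len(stuff), 2):
        if PRIORITIES[stuff[i]] <= PRIORITIES[stuff[lowest_i]]:
            lowest_i = i
    return (split(stuff[:lowest_i]), stuff[lowest_i], split(stuff[lowest_i + 1:]))

# '==' binds loosest here, so it ends up at the root:
# (('a', '+', ('b', '*', 'c')), '==', 'd')
print(split(["a", "+", "b", "*", "c", "==", "d"]))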
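The docstring convention that generate_grammar() relies on (text between dollar signs is the right-hand side of the rule named by the class or function) can likewise be exercised on its own. This is a minimal sketch assuming only the standard re module; the Ternary example class and the extract_rule() helper are made up for illustration and do not exist in the checked-in code.

import re

class Ternary(object):
    "Grammar: $ BinaryExpression ['?' BinaryExpression ':' BinaryExpression] $"

def extract_rule(obj):
    """Return (name, rhs) if obj's docstring declares a grammar rule, else None."""
    m = re.search(r"Grammar:\s*\$([^$]+)\$", obj.__doc__ or "")
    if m:
        # Collapse internal whitespace, as generate_grammar() does before writing.
        return obj.__name__, " ".join(m.group(1).split())
    return None

# Prints: Ternary: BinaryExpression ['?' BinaryExpression ':' BinaryExpression]
name, rhs = extract_rule(Ternary)
print("%s: %s" % (name, rhs))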