[pypy-svn] r13358 - pypy/branch/pycompiler/module/recparser
adim at codespeak.net
Mon Jun 13 18:29:29 CEST 2005
Author: adim
Date: Mon Jun 13 18:29:28 2005
New Revision: 13358
Modified:
pypy/branch/pycompiler/module/recparser/pythonlexer.py
Log:
- removed the regexp-based parser (the automata-based one is fully compatible
and is now translatable)
- provided a dummy implementation for encoding decl recognition (no longer
uses regexps)
- tidy up
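
As a quick illustration of what the new match_encoding_declaration helper
(in the diff below) is meant to do -- the sample comments here are made up
for this note, not taken from the commit:

>>> match_encoding_declaration('# -*- coding: latin-1 -*-')
'latin-1'
>>> match_encoding_declaration('# vim: set fileencoding=utf-8 :')
'utf-8'
>>> match_encoding_declaration('# plain comment')   # no declaration -> None

Like the regexp it replaces, the scanner accepts 'coding' as a bare substring
(e.g. inside 'fileencoding'), which mirrors CPython's PEP 263 handling.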
Modified: pypy/branch/pycompiler/module/recparser/pythonlexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonlexer.py (original)
+++ pypy/branch/pycompiler/module/recparser/pythonlexer.py Mon Jun 13 18:29:28 2005
@@ -4,47 +4,44 @@
"""
from grammar import TokenSource, Token
+# Don't import string for that ...
+NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
+NUMCHARS = '0123456789'
+ALNUMCHARS = NAMECHARS + NUMCHARS
+EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
+WHITESPACES = ' \t\n\r\v\f'
+
+def match_encoding_declaration(comment):
+    """returns the declared encoding or None
+
+    This function is a replacement for:
+    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
+    >>> py_encoding.search(comment)
+    """
+    index = comment.find('coding')
+    if index == -1:
+        return None
+    if index + 6 >= len(comment):
+        # 'coding' sits at the very end of the comment: no ':' or '=' follows
+        return None
+    next_char = comment[index + 6]
+    if next_char not in ':=':
+        return None
+    end_of_decl = comment[index + 7:]
+    index = 0
+    for char in end_of_decl:
+        if char not in WHITESPACES:
+            break
+        index += 1
+    else:
+        return None
+    encoding = ''
+    for char in end_of_decl[index:]:
+        if char in EXTENDED_ALNUMCHARS:
+            encoding += char
+        else:
+            break
+    if encoding != '':
+        return encoding
+    return None
-DEBUG = False
-import re
-
-KEYWORDS = [
-    'and', 'assert', 'break', 'class', 'continue', 'def', 'del',
-    'elif', 'if', 'import', 'in', 'is', 'finally', 'for', 'from',
-    'global', 'else', 'except', 'exec', 'lambda', 'not', 'or',
-    'pass', 'print', 'raise', 'return', 'try', 'while', 'yield'
-    ]
-
-py_keywords = re.compile(r'(%s)$' % ('|'.join(KEYWORDS)), re.M | re.X)
-
-py_punct = re.compile(r"""
-<>|!=|==|~|
-<=|<<=|<<|<|
->=|>>=|>>|>|
-\*=|\*\*=|\*\*|\*|
-//=|/=|//|/|
-%=|\^=|\|=|\+=|=|&=|-=|
-,|\^|&|\+|-|\.|%|\||
-\)|\(|;|:|@|\[|\]|`|\{|\}
-""", re.M | re.X)
-
-g_symdef = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*:", re.M)
-g_string = re.compile(r"'[^']+'", re.M)
-py_name = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*", re.M)
-py_comment = re.compile(r"#.*$|[ \t\014]*$", re.M)
-py_ws = re.compile(r" *", re.M)
-py_skip = re.compile(r"[ \t\014]*(#.*$)?", re.M)
-py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
-# py_number = re.compile(r"0x[0-9a-z]+|[0-9]+l|([0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)(e[+-]?[0-9]+)?j?||[0-9]+", re.I)
-
-# 0x[\da-f]+l matches hexadecimal numbers, possibly defined as long
-# \d+l matches and only matches long integers
-# (\d+\.\d*|\.\d+|\d+)(e[+-]?\d+)?j? matches simple integers,
-# exponential notations and complex
-py_number = re.compile(r"""0x[\da-f]+l?|
-\d+l|
-(\d+\.\d*|\.\d+|\d+)(e[+-]?\d+)?j?
-""", re.I | re.X)
def _normalize_encoding(encoding):
"""returns normalized name for <encoding>
@@ -64,251 +61,9 @@
        return 'iso-8859-1'
    return encoding
-class PythonSource(TokenSource):
-    """The Python tokenizer"""
-    def __init__(self, inpstring):
-        TokenSource.__init__(self)
-        self.input = inpstring
-        self.pos = 0
-        self.indent = 0
-        self.indentstack = [ 0 ]
-        self.atbol = True
-        self.line = 1
-        self._current_line = 1
-        self.pendin = 0 # indentation change waiting to be reported
-        self.level = 0
-        self.linestart = 0
-        self.stack = []
-        self.stack_pos = 0
-        self.comment = ''
-        self.encoding = None
-
-
-    def current_line(self):
-        return self._current_line
-
-    def context(self):
-        return self.stack_pos
-
-    def restore(self, ctx):
-        self.stack_pos = ctx
-
-    def offset(self, ctx=None):
-        if ctx is None:
-            return self.stack_pos
-        else:
-            assert type(ctx)==int
-            return ctx
-
-    def _next(self):
-        """returns the next token from source"""
-        inp = self.input
-        pos = self.pos
-        input_length = len(inp)
-        if pos >= input_length:
-            return self.end_of_file()
-        # Beginning of line
-        if self.atbol:
-            self.linestart = pos
-            col = 0
-            m = py_ws.match(inp, pos)
-            pos = m.end()
-            col = pos - self.linestart
-            self.atbol = False
-            # skip blanklines
-            m = py_comment.match(inp, pos)
-            if m:
-                if not self.comment:
-                    self.comment = m.group(0)
-                # <HACK> XXX FIXME: encoding management
-                if self.line <= 2:
-                    # self.comment can be the previous comment, so don't use it
-                    comment = m.group(0)[1:]
-                    m_enc = py_encoding.search(comment)
-                    if m_enc is not None:
-                        self.encoding = _normalize_encoding(m_enc.group(1))
-                # </HACK>
-                self.pos = m.end() + 1
-                self.line += 1
-                self.atbol = True
-                return self._next()
-            # the current block is more indented than the previous one
-            if col > self.indentstack[-1]:
-                self.indentstack.append(col)
-                return "INDENT", None
-            # the current block is less indented than the previous one
-            while col < self.indentstack[-1]:
-                self.pendin += 1
-                self.indentstack.pop(-1)
-            if col != self.indentstack[-1]:
-                raise SyntaxError("Indentation Error")
-        if self.pendin > 0:
-            self.pendin -= 1
-            return "DEDENT", None
-        m = py_skip.match(inp, pos)
-        if m.group(0)[-1:] == '\n':
-            self.line += 1
-        self.comment = m.group(1) or ''
-        pos = m.end() # always match
-        if pos >= input_length:
-            return self.end_of_file()
-        self.pos = pos
-
-        # STRING
-        c = inp[pos]
-        if c in ('r','R'):
-            if pos < input_length-1 and inp[pos+1] in ("'",'"'):
-                return self.next_string(raw=1)
-        elif c in ('u','U'):
-            if pos < input_length-1:
-                if inp[pos+1] in ("r",'R'):
-                    if pos<input_length-2 and inp[pos+2] in ("'",'"'):
-                        return self.next_string( raw = 1, uni = 1 )
-                elif inp[pos+1] in ( "'", '"' ):
-                    return self.next_string( uni = 1 )
-        elif c in ( '"', "'" ):
-            return self.next_string()
-
-        # NAME
-        m = py_name.match(inp, pos)
-        if m:
-            self.pos = m.end()
-            val = m.group(0)
-            # if py_keywords.match(val):
-            #     return val, None
-            return "NAME", val
-
-        # NUMBER
-        m = py_number.match(inp, pos)
-        if m:
-            self.pos = m.end()
-            return "NUMBER", m.group(0)
-
-        # NEWLINE
-        if c == '\n':
-            self.pos += 1
-            self.line += 1
-            if self.level > 0:
-                return self._next()
-            else:
-                self.atbol = True
-                comment = self.comment
-                self.comment = ''
-                return "NEWLINE", comment
-
-        if c == '\\':
-            if pos < input_length-1 and inp[pos+1] == '\n':
-                self.pos += 2
-                return self._next()
-
-        m = py_punct.match(inp, pos)
-        if m:
-            punct = m.group(0)
-            if punct in ( '(', '{', '[' ):
-                self.level += 1
-            if punct in ( ')', '}', ']' ):
-                self.level -= 1
-            self.pos = m.end()
-            return punct, None
-        raise SyntaxError("Unrecognized token '%s'" % inp[pos:pos+20] )
-
-    def next(self):
-        if self.stack_pos >= len(self.stack):
-            pos0 = self.pos
-            tok, val = self._next()
-            token = Token( tok, val )
-            self.stack.append( ( token, self.line, pos0) )
-            self._current_line = self.line
-        else:
-            token, line, pos0 = self.stack[self.stack_pos]
-            self._current_line = line
-        self.stack_pos += 1
-        if DEBUG:
-            print "%d/%d: %s, %s" % (self.stack_pos, len(self.stack), tok, val)
-        return token
-
-    def get_pos(self):
-        if self.stack_pos >= len(self.stack):
-            return self.pos
-        else:
-            token, line, pos = self.stack[self.stack_pos]
-            return pos
-
-    def get_source_text(self, pos0, pos1 ):
-        return self.input[pos0:pos1]
-
-    def peek(self):
-        """returns next token without consuming it"""
-        ctx = self.context()
-        token = self.next()
-        self.restore(ctx)
-        return token
-
-
-    def end_of_file(self):
-        """return DEDENT and ENDMARKER"""
-        if len(self.indentstack) == 1:
-            self.indentstack.pop(-1)
-            return "NEWLINE", '' #self.comment
-        elif len(self.indentstack) > 1:
-            self.indentstack.pop(-1)
-            return "DEDENT", None
-        return "ENDMARKER", None
-
-
-    def next_string(self, raw=0, uni=0):
-        pos = self.pos + raw + uni
-        inp = self.input
-        quote = inp[pos]
-        qsize = 1
-        if inp[pos:pos+3] == 3*quote:
-            pos += 3
-            quote = 3*quote
-            qsize = 3
-        else:
-            pos += 1
-        while True:
-            if inp[pos:pos+qsize] == quote:
-                s = inp[self.pos:pos+qsize]
-                self.pos = pos+qsize
-                return "STRING", s
-            # FIXME : shouldn't it be inp[pos] == os.linesep ?
-            if inp[pos:pos+2] == "\n" and qsize == 1:
-                return None, None
-            if inp[pos] == "\\":
-                pos += 1
-            pos += 1
-
-    def debug(self):
-        """return context for debug information"""
-        if not hasattr(self, '_lines'):
-            # split lines only once
-            self._lines = self.input.splitlines()
-        if self.line > len(self._lines):
-            lineno = len(self._lines)
-        else:
-            lineno = self.line
-        return 'line %s : %s' % (lineno, self._lines[lineno-1])
-
-
################################################################################
-class StringAsFile(object):
-    """XXX: Is StringIO RPython ?"""
-
-    def __init__(self, inpstring):
-        self.lines = inpstring.splitlines(True)
-        self.lineno = 0
-
-    def readline(self):
-        if self.lineno < len(self.lines):
-            line = self.lines[self.lineno]
-            self.lineno += 1
-            return line
-        return ''
-
-
import token as tokenmod
-from pypy.module.parser.pytokenize import generate_tokens, tabsize, \
+from pypy.module.parser.pytokenize import tabsize, \
whiteSpaceDFA, triple_quoted, endDFAs, single_quoted, pseudoDFA
from pypy.module.parser import automata
@@ -347,8 +102,8 @@
"""
token_list = []
lnum = parenlev = continued = 0
- namechars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
- numchars = '0123456789'
+ namechars = NAMECHARS
+ numchars = NUMCHARS
contstr, needcont = '', 0
contline = None
indents = [0]
@@ -405,10 +160,10 @@
                if line[pos] == '#':
                    tok = token_from_values(tokenmod.COMMENT, line[pos:])
                    last_comment = line[pos:]
-                    if lnum <= 2:
-                        m_enc = py_encoding.search(last_comment)
-                        if m_enc is not None:
-                            encoding = _normalize_encoding(m_enc.group(1))
+                    if lnum <= 2 and encoding is None:
+                        encoding = match_encoding_declaration(last_comment)
+                        if encoding is not None:
+                            encoding = _normalize_encoding(encoding)
                else:
                    tok = token_from_values(tokenmod.NL, line[pos:])
                    last_comment = ''
@@ -468,10 +223,10 @@
            elif initial == '#':
                tok = token_from_values(tokenmod.COMMENT, token)
                last_comment = token
-                if lnum <= 2:
-                    m_enc = py_encoding.search(last_comment)
-                    if m_enc is not None:
-                        encoding = _normalize_encoding(m_enc.group(1))
+                if lnum <= 2 and encoding is None:
+                    encoding = match_encoding_declaration(last_comment)
+                    if encoding is not None:
+                        encoding = _normalize_encoding(encoding)
                # XXX Skip # token_list.append((tok, line))
                # token_list.append((COMMENT, token, spos, epos, line))
            elif token in triple_quoted:
@@ -540,12 +295,10 @@
    # token_list.append((ENDMARKER, '', (lnum, 0), (lnum, 0), ''))
    return token_list, encoding
-
-class PythonSource2(TokenSource):
+class PythonSource(TokenSource):
"""This source uses Jonathan's tokenizer"""
def __init__(self, inpstring):
TokenSource.__init__(self)
- # tokens, encoding = generate_tokens(StringAsFile(inpstring).readline)
tokens, encoding = generate_tokens(inpstring.splitlines(True))
self.token_stack = tokens
self.encoding = encoding
@@ -611,7 +364,7 @@
        return Token('NEWLINE', '') # XXX pending comment ?
    return Token(tokenmod.tok_name[tok_type], tok_string)
-Source = PythonSource2
+Source = PythonSource
def tokenize_file(filename):
    f = file(filename).read()
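
As a rough usage sketch for the renamed PythonSource (illustrative only: the
flat import path, the sample input and the expected token kinds are
assumptions for this note, not part of the commit):

    # assumes pythonlexer.py is importable directly; Source is the
    # module-level alias bound to PythonSource above
    from pythonlexer import Source

    src = Source('x = 1\n')
    print src.encoding    # None: the input declares no encoding
    for _ in range(4):    # roughly NAME, OP, NUMBER, NEWLINE
        print src.next()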