[pypy-svn] r13358 - pypy/branch/pycompiler/module/recparser
adim at codespeak.net
Mon Jun 13 18:29:29 CEST 2005
Author: adim
Date: Mon Jun 13 18:29:28 2005
New Revision: 13358
Modified:
pypy/branch/pycompiler/module/recparser/pythonlexer.py
Log:
- removed the regexp-based parser (the automata-based one is fully compatible
and is now translatable)
- provided a dummy implementation for encoding decl recognition (no longer
uses regexps)
- tidy up
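
As a quick illustration of what the new match_encoding_declaration helper
(in the diff below) is meant to do -- the sample comments here are made up
for this note, not taken from the commit:

>>> match_encoding_declaration('# -*- coding: latin-1 -*-')
'latin-1'
>>> match_encoding_declaration('# vim: set fileencoding=utf-8 :')
'utf-8'
>>> match_encoding_declaration('# plain comment')   # no declaration -> None

Like the regexp it replaces, the scanner accepts 'coding' as a bare substring
(e.g. inside 'fileencoding'), which mirrors CPython's PEP 263 handling.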
Modified: pypy/branch/pycompiler/module/recparser/pythonlexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonlexer.py (original)
+++ pypy/branch/pycompiler/module/recparser/pythonlexer.py Mon Jun 13 18:29:28 2005
@@ -4,47 +4,44 @@
"""
from grammar import TokenSource, Token
+# Don't import string for that ...
+NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
+NUMCHARS = '0123456789'
+ALNUMCHARS = NAMECHARS + NUMCHARS
+EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
+WHITESPACES = ' \t\n\r\v\f'
+
+def match_encoding_declaration(comment):
+    """returns the declared encoding or None
+
+    This function is a replacement for:
+    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
+    >>> py_encoding.search(comment)
+    """
+    index = comment.find('coding')
+    if index == -1:
+        return None
+    if index + 6 >= len(comment):
+        # 'coding' sits at the very end of the comment: no ':' or '=' follows
+        return None
+    next_char = comment[index + 6]
+    if next_char not in ':=':
+        return None
+    end_of_decl = comment[index + 7:]
+    index = 0
+    for char in end_of_decl:
+        if char not in WHITESPACES:
+            break
+        index += 1
+    else:
+        return None
+    encoding = ''
+    for char in end_of_decl[index:]:
+        if char in EXTENDED_ALNUMCHARS:
+            encoding += char
+        else:
+            break
+    if encoding != '':
+        return encoding
+    return None
-DEBUG = False
-import re
-
-KEYWORDS = [
-    'and', 'assert', 'break', 'class', 'continue', 'def', 'del',
-    'elif', 'if', 'import', 'in', 'is', 'finally', 'for', 'from',
-    'global', 'else', 'except', 'exec', 'lambda', 'not', 'or',
-    'pass', 'print', 'raise', 'return', 'try', 'while', 'yield'
-    ]
-
-py_keywords = re.compile(r'(%s)$' % ('|'.join(KEYWORDS)), re.M | re.X)
-
-py_punct = re.compile(r"""
-<>|!=|==|~|
-<=|<<=|<<|<|
->=|>>=|>>|>|
-\*=|\*\*=|\*\*|\*|
-//=|/=|//|/|
-%=|\^=|\|=|\+=|=|&=|-=|
-,|\^|&|\+|-|\.|%|\||
-\)|\(|;|:|@|\[|\]|`|\{|\}
-""", re.M | re.X)
-
-g_symdef = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*:", re.M)
-g_string = re.compile(r"'[^']+'", re.M)
-py_name = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*", re.M)
-py_comment = re.compile(r"#.*$|[ \t\014]*$", re.M)
-py_ws = re.compile(r" *", re.M)
-py_skip = re.compile(r"[ \t\014]*(#.*$)?", re.M)
-py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
-# py_number = re.compile(r"0x[0-9a-z]+|[0-9]+l|([0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)(e[+-]?[0-9]+)?j?||[0-9]+", re.I)
-
-# 0x[\da-f]+l matches hexadecimal numbers, possibly defined as long
-# \d+l matches and only matches long integers
-# (\d+\.\d*|\.\d+|\d+)(e[+-]?\d+)?j? matches simple integers,
-# exponential notations and complex
-py_number = re.compile(r"""0x[\da-f]+l?|
-\d+l|
-(\d+\.\d*|\.\d+|\d+)(e[+-]?\d+)?j?
-""", re.I | re.X)
def _normalize_encoding(encoding):
"""returns normalized name for <encoding>
@@ -64,251 +61,9 @@
        return 'iso-8859-1'
    return encoding
-class PythonSource(TokenSource):
-    """The Python tokenizer"""
-    def __init__(self, inpstring):
-        TokenSource.__init__(self)
-        self.input = inpstring
-        self.pos = 0
-        self.indent = 0
-        self.indentstack = [ 0 ]
-        self.atbol = True
-        self.line = 1
-        self._current_line = 1
-        self.pendin = 0 # indentation change waiting to be reported
-        self.level = 0
-        self.linestart = 0
-        self.stack = []
-        self.stack_pos = 0
-        self.comment = ''
-        self.encoding = None
-
-
-    def current_line(self):
-        return self._current_line
-
-    def context(self):
-        return self.stack_pos
-
-    def restore(self, ctx):
-        self.stack_pos = ctx
-
-    def offset(self, ctx=None):
-        if ctx is None:
-            return self.stack_pos
-        else:
-            assert type(ctx)==int
-            return ctx
-
-    def _next(self):
-        """returns the next token from source"""
-        inp = self.input
-        pos = self.pos
-        input_length = len(inp)
-        if pos >= input_length:
-            return self.end_of_file()
-        # Beginning of line
-        if self.atbol:
-            self.linestart = pos
-            col = 0
-            m = py_ws.match(inp, pos)
-            pos = m.end()
-            col = pos - self.linestart
-            self.atbol = False
-            # skip blanklines
-            m = py_comment.match(inp, pos)
-            if m:
-                if not self.comment:
-                    self.comment = m.group(0)
-                # <HACK> XXX FIXME: encoding management
-                if self.line <= 2:
-                    # self.comment can be the previous comment, so don't use it
-                    comment = m.group(0)[1:]
-                    m_enc = py_encoding.search(comment)
-                    if m_enc is not None:
-                        self.encoding = _normalize_encoding(m_enc.group(1))
-                # </HACK>
-                self.pos = m.end() + 1
-                self.line += 1
-                self.atbol = True
-                return self._next()
-            # the current block is more indented than the previous one
-            if col > self.indentstack[-1]:
-                self.indentstack.append(col)
-                return "INDENT", None
-            # the current block is less indented than the previous one
-            while col < self.indentstack[-1]:
-                self.pendin += 1
-                self.indentstack.pop(-1)
-            if col != self.indentstack[-1]:
-                raise SyntaxError("Indentation Error")
-        if self.pendin > 0:
-            self.pendin -= 1
-            return "DEDENT", None
-        m = py_skip.match(inp, pos)
-        if m.group(0)[-1:] == '\n':
-            self.line += 1
-        self.comment = m.group(1) or ''
-        pos = m.end() # always match
-        if pos >= input_length:
-            return self.end_of_file()
-        self.pos = pos
-
-        # STRING
-        c = inp[pos]
-        if c in ('r','R'):
-            if pos < input_length-1 and inp[pos+1] in ("'",'"'):
-                return self.next_string(raw=1)
-        elif c in ('u','U'):
-            if pos < input_length-1:
-                if inp[pos+1] in ("r",'R'):
-                    if pos<input_length-2 and inp[pos+2] in ("'",'"'):
-                        return self.next_string( raw = 1, uni = 1 )
-                elif inp[pos+1] in ( "'", '"' ):
-                    return self.next_string( uni = 1 )
-        elif c in ( '"', "'" ):
-            return self.next_string()
-
-        # NAME
-        m = py_name.match(inp, pos)
-        if m:
-            self.pos = m.end()
-            val = m.group(0)
-            # if py_keywords.match(val):
-            #     return val, None
-            return "NAME", val
-
-        # NUMBER
-        m = py_number.match(inp, pos)
-        if m:
-            self.pos = m.end()
-            return "NUMBER", m.group(0)
-
-        # NEWLINE
-        if c == '\n':
-            self.pos += 1
-            self.line += 1
-            if self.level > 0:
-                return self._next()
-            else:
-                self.atbol = True
-                comment = self.comment
-                self.comment = ''
-                return "NEWLINE", comment
-
-        if c == '\\':
-            if pos < input_length-1 and inp[pos+1] == '\n':
-                self.pos += 2
-                return self._next()
-
-        m = py_punct.match(inp, pos)
-        if m:
-            punct = m.group(0)
-            if punct in ( '(', '{', '[' ):
-                self.level += 1
-            if punct in ( ')', '}', ']' ):
-                self.level -= 1
-            self.pos = m.end()
-            return punct, None
-        raise SyntaxError("Unrecognized token '%s'" % inp[pos:pos+20] )
-
-    def next(self):
-        if self.stack_pos >= len(self.stack):
-            pos0 = self.pos
-            tok, val = self._next()
-            token = Token( tok, val )
-            self.stack.append( ( token, self.line, pos0) )
-            self._current_line = self.line
-        else:
-            token, line, pos0 = self.stack[self.stack_pos]
-            self._current_line = line
-        self.stack_pos += 1
-        if DEBUG:
-            print "%d/%d: %s, %s" % (self.stack_pos, len(self.stack), tok, val)
-        return token
-
-    def get_pos(self):
-        if self.stack_pos >= len(self.stack):
-            return self.pos
-        else:
-            token, line, pos = self.stack[self.stack_pos]
-            return pos
-
-    def get_source_text(self, pos0, pos1 ):
-        return self.input[pos0:pos1]
-
-    def peek(self):
-        """returns next token without consuming it"""
-        ctx = self.context()
-        token = self.next()
-        self.restore(ctx)
-        return token
-
-
-    def end_of_file(self):
-        """return DEDENT and ENDMARKER"""
-        if len(self.indentstack) == 1:
-            self.indentstack.pop(-1)
-            return "NEWLINE", '' #self.comment
-        elif len(self.indentstack) > 1:
-            self.indentstack.pop(-1)
-            return "DEDENT", None
-        return "ENDMARKER", None
-
-
-    def next_string(self, raw=0, uni=0):
-        pos = self.pos + raw + uni
-        inp = self.input
-        quote = inp[pos]
-        qsize = 1
-        if inp[pos:pos+3] == 3*quote:
-            pos += 3
-            quote = 3*quote
-            qsize = 3
-        else:
-            pos += 1
-        while True:
-            if inp[pos:pos+qsize] == quote:
-                s = inp[self.pos:pos+qsize]
-                self.pos = pos+qsize
-                return "STRING", s
-            # FIXME : shouldn't it be inp[pos] == os.linesep ?
-            if inp[pos:pos+2] == "\n" and qsize == 1:
-                return None, None
-            if inp[pos] == "\\":
-                pos += 1
-            pos += 1
-
-    def debug(self):
-        """return context for debug information"""
-        if not hasattr(self, '_lines'):
-            # split lines only once
-            self._lines = self.input.splitlines()
-        if self.line > len(self._lines):
-            lineno = len(self._lines)
-        else:
-            lineno = self.line
-        return 'line %s : %s' % (lineno, self._lines[lineno-1])
-
-
################################################################################
-class StringAsFile(object):
-    """XXX: Is StringIO RPython ?"""
-
-    def __init__(self, inpstring):
-        self.lines = inpstring.splitlines(True)
-        self.lineno = 0
-
-    def readline(self):
-        if self.lineno < len(self.lines):
-            line = self.lines[self.lineno]
-            self.lineno += 1
-            return line
-        return ''
-
-
import token as tokenmod
-from pypy.module.parser.pytokenize import generate_tokens, tabsize, \
+from pypy.module.parser.pytokenize import tabsize, \
whiteSpaceDFA, triple_quoted, endDFAs, single_quoted, pseudoDFA
from pypy.module.parser import automata
@@ -347,8 +102,8 @@
"""
token_list = []
lnum = parenlev = continued = 0
- namechars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
- numchars = '0123456789'
+ namechars = NAMECHARS
+ numchars = NUMCHARS
contstr, needcont = '', 0
contline = None
indents = [0]
@@ -405,10 +160,10 @@
                if line[pos] == '#':
                    tok = token_from_values(tokenmod.COMMENT, line[pos:])
                    last_comment = line[pos:]
-                    if lnum <= 2:
-                        m_enc = py_encoding.search(last_comment)
-                        if m_enc is not None:
-                            encoding = _normalize_encoding(m_enc.group(1))
+                    if lnum <= 2 and encoding is None:
+                        encoding = match_encoding_declaration(last_comment)
+                        if encoding is not None:
+                            encoding = _normalize_encoding(encoding)
                else:
                    tok = token_from_values(tokenmod.NL, line[pos:])
                    last_comment = ''
@@ -468,10 +223,10 @@
            elif initial == '#':
                tok = token_from_values(tokenmod.COMMENT, token)
                last_comment = token
-                if lnum <= 2:
-                    m_enc = py_encoding.search(last_comment)
-                    if m_enc is not None:
-                        encoding = _normalize_encoding(m_enc.group(1))
+                if lnum <= 2 and encoding is None:
+                    encoding = match_encoding_declaration(last_comment)
+                    if encoding is not None:
+                        encoding = _normalize_encoding(encoding)
                # XXX Skip # token_list.append((tok, line))
                # token_list.append((COMMENT, token, spos, epos, line))
            elif token in triple_quoted:
@@ -540,12 +295,10 @@
    # token_list.append((ENDMARKER, '', (lnum, 0), (lnum, 0), ''))
    return token_list, encoding
-
-class PythonSource2(TokenSource):
+class PythonSource(TokenSource):
"""This source uses Jonathan's tokenizer"""
def __init__(self, inpstring):
TokenSource.__init__(self)
- # tokens, encoding = generate_tokens(StringAsFile(inpstring).readline)
tokens, encoding = generate_tokens(inpstring.splitlines(True))
self.token_stack = tokens
self.encoding = encoding
@@ -611,7 +364,7 @@
        return Token('NEWLINE', '') # XXX pending comment ?
    return Token(tokenmod.tok_name[tok_type], tok_string)
-Source = PythonSource2
+Source = PythonSource
def tokenize_file(filename):
    f = file(filename).read()
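
As a rough usage sketch for the renamed PythonSource (illustrative only: the
flat import path, the sample input and the expected token kinds are
assumptions for this note, not part of the commit):

    # assumes pythonlexer.py is importable directly; Source is the
    # module-level alias bound to PythonSource above
    from pythonlexer import Source

    src = Source('x = 1\n')
    print src.encoding    # None: the input declares no encoding
    for _ in range(4):    # roughly NAME, OP, NUMBER, NEWLINE
        print src.next()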