[pypy-svn] r66294 - in pypy/branch/parser-compiler/pypy: interpreter interpreter/astcompiler interpreter/pyparser interpreter/pyparser/test module/__builtin__

benjamin at codespeak.net benjamin at codespeak.net
Thu Jul 16 17:37:48 CEST 2009


Author: benjamin
Date: Thu Jul 16 17:37:47 2009
New Revision: 66294

Modified:
   pypy/branch/parser-compiler/pypy/interpreter/astcompiler/consts.py
   pypy/branch/parser-compiler/pypy/interpreter/pycompiler.py
   pypy/branch/parser-compiler/pypy/interpreter/pyparser/pyparse.py
   pypy/branch/parser-compiler/pypy/interpreter/pyparser/pytokenizer.py
   pypy/branch/parser-compiler/pypy/interpreter/pyparser/test/test_pyparse.py
   pypy/branch/parser-compiler/pypy/module/__builtin__/compiling.py
Log:
indicate with a parser flag if the source is unicode

This also moves the PyCF_* compiler flags into astcompiler.consts instead of importing them from codeop.


Modified: pypy/branch/parser-compiler/pypy/interpreter/astcompiler/consts.py
==============================================================================
--- pypy/branch/parser-compiler/pypy/interpreter/astcompiler/consts.py	(original)
+++ pypy/branch/parser-compiler/pypy/interpreter/astcompiler/consts.py	Thu Jul 16 17:37:47 2009
@@ -22,3 +22,6 @@
 CO_FUTURE_DIVISION = 0x2000
 CO_FUTURE_ABSOLUTE_IMPORT = 0x4000
 CO_FUTURE_WITH_STATEMENT = 0x8000
+
+PyCF_SOURCE_IS_UTF8 = 0x0100
+PyCF_DONT_IMPLY_DEDENT = 0x0200

Modified: pypy/branch/parser-compiler/pypy/interpreter/pycompiler.py
==============================================================================
--- pypy/branch/parser-compiler/pypy/interpreter/pycompiler.py	(original)
+++ pypy/branch/parser-compiler/pypy/interpreter/pycompiler.py	Thu Jul 16 17:37:47 2009
@@ -4,7 +4,7 @@
 """
 
 import sys
-from codeop import PyCF_DONT_IMPLY_DEDENT
+from pypy.interpreter.astcompiler.consts import PyCF_DONT_IMPLY_DEDENT
 from pypy.interpreter.error import OperationError
 
 class AbstractCompiler(object):

Modified: pypy/branch/parser-compiler/pypy/interpreter/pyparser/pyparse.py
==============================================================================
--- pypy/branch/parser-compiler/pypy/interpreter/pyparser/pyparse.py	(original)
+++ pypy/branch/parser-compiler/pypy/interpreter/pyparser/pyparse.py	Thu Jul 16 17:37:47 2009
@@ -1,8 +1,7 @@
-import codeop
 from pypy.interpreter import gateway
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.pyparser import parser, pytokenizer, pygram, error
-from pypy.interpreter.astcompiler.consts import CO_FUTURE_WITH_STATEMENT
+from pypy.interpreter.astcompiler import consts
 
 
 _recode_to_utf8 = gateway.applevel(r'''
@@ -82,13 +81,18 @@
         """Parse a python source according to goal"""
         # Detect source encoding.
         enc = None
-        if textsrc[:3] == '\xEF\xBB\xBF':
+        if textsrc.startswith("\xEF\xBB\xBF"):
             textsrc = textsrc[3:]
             enc = 'utf-8'
-            # check that there is no explicit encoding declared
+            # If an encoding is explicitly given check that it is utf-8.
             decl_enc = _check_for_encoding(textsrc)
-            if decl_enc is not None:
-                raise SyntaxError("encoding declaration in Unicode string")
+            if decl_enc and decl_enc != "utf-8":
+                raise error.SyntaxError("UTF-8 BOM with non-utf8 coding cookie")
+        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
+            enc = 'utf-8'
+
+            if _check_for_encoding(textsrc) is not None:
+                raise error.SyntaxError("coding declaration in unicode string")
         else:
             enc = _normalize_encoding(_check_for_encoding(textsrc))
             if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
@@ -104,7 +108,7 @@
                     raise
 
         flags = compile_info.flags
-        if flags & CO_FUTURE_WITH_STATEMENT:
+        if flags & consts.CO_FUTURE_WITH_STATEMENT:
             self.grammar = pygram.python_grammar
         else:
             self.grammar = pygram.python_grammar_no_with_statement
@@ -112,7 +116,7 @@
         if source_lines and not source_lines[-1].endswith("\n"):
             source_lines[-1] += '\n'
         if textsrc and textsrc[-1] == "\n":
-            flags &= ~codeop.PyCF_DONT_IMPLY_DEDENT
+            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
 
         self.prepare(_targets[compile_info.mode])
         try:

Modified: pypy/branch/parser-compiler/pypy/interpreter/pyparser/pytokenizer.py
==============================================================================
--- pypy/branch/parser-compiler/pypy/interpreter/pyparser/pytokenizer.py	(original)
+++ pypy/branch/parser-compiler/pypy/interpreter/pyparser/pytokenizer.py	Thu Jul 16 17:37:47 2009
@@ -1,10 +1,10 @@
-import codeop
 from pypy.interpreter.pyparser import automata
 from pypy.interpreter.pyparser.pygram import tokens
 from pypy.interpreter.pyparser.pytoken import python_opmap
 from pypy.interpreter.pyparser.error import TokenError, TokenIndentationError
 from pypy.interpreter.pyparser.pytokenize import tabsize, whiteSpaceDFA, \
     triple_quoted, endDFAs, single_quoted, pseudoDFA
+from pypy.interpreter.astcompiler import consts
 
 NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
 NUMCHARS = '0123456789'
@@ -231,7 +231,7 @@
                 pos = pos + 1
 
     lnum -= 1
-    if not (flags & codeop.PyCF_DONT_IMPLY_DEDENT):
+    if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
         if token_list and token_list[-1][0] != tokens.NEWLINE:
             tok = (tokens.NEWLINE, '', lnum, 0, '\n')
             token_list.append(tok)

Modified: pypy/branch/parser-compiler/pypy/interpreter/pyparser/test/test_pyparse.py
==============================================================================
--- pypy/branch/parser-compiler/pypy/interpreter/pyparser/test/test_pyparse.py	(original)
+++ pypy/branch/parser-compiler/pypy/interpreter/pyparser/test/test_pyparse.py	Thu Jul 16 17:37:47 2009
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-import codeop
 import py
 from pypy.interpreter.pyparser import pyparse
 from pypy.interpreter.pyparser.pygram import syms, tokens
 from pypy.interpreter.pyparser.error import SyntaxError, IndentationError
+from pypy.interpreter.astcompiler import consts
 
 
 class TestPythonParser:
@@ -22,7 +22,7 @@
 
     def test_dont_imply_dedent(self):
         info = pyparse.CompileInfo("<test>", "single",
-                                   codeop.PyCF_DONT_IMPLY_DEDENT)
+                                   consts.PyCF_DONT_IMPLY_DEDENT)
         self.parse('if 1:\n  x\n', info=info)
         self.parse('x = 5 ', info=info)
 
@@ -42,6 +42,17 @@
         input = (u"# coding: utf-7\nstuff = %s" % (sentence,)).encode("utf-7")
         tree = self.parse(input, info=info)
         assert info.encoding == "utf-7"
+        input = "\xEF\xBB\xBF# coding: utf-8\nx"
+        self.parse(input, info=info)
+        assert info.encoding == "utf-8"
+        input = "# coding: utf-8\nx"
+        info.flags |= consts.PyCF_SOURCE_IS_UTF8
+        exc = py.test.raises(SyntaxError, self.parse, input, info=info).value
+        info.flags &= ~consts.PyCF_SOURCE_IS_UTF8
+        assert exc.msg == "coding declaration in unicode string"
+        input = "\xEF\xBB\xBF# coding: latin-1\nx"
+        exc = py.test.raises(SyntaxError, self.parse, input).value
+        assert exc.msg == "UTF-8 BOM with non-utf8 coding cookie"
         input = "# coding: not-here"
         exc = py.test.raises(SyntaxError, self.parse, input).value
         assert exc.msg == "Unknown encoding: not-here"

Modified: pypy/branch/parser-compiler/pypy/module/__builtin__/compiling.py
==============================================================================
--- pypy/branch/parser-compiler/pypy/module/__builtin__/compiling.py	(original)
+++ pypy/branch/parser-compiler/pypy/module/__builtin__/compiling.py	Thu Jul 16 17:37:47 2009
@@ -4,7 +4,8 @@
 
 from pypy.interpreter.pycode import PyCode
 from pypy.interpreter.baseobjspace import W_Root, ObjSpace
-from pypy.interpreter.error import OperationError 
+from pypy.interpreter.error import OperationError
+from pypy.interpreter.astcompiler import consts
 from pypy.interpreter.gateway import NoneNotWrapped
 
 def compile(space, w_source, filename, mode, flags=0, dont_inherit=0):
@@ -21,11 +22,11 @@
 in addition to any features explicitly specified.
 """
     if space.is_true(space.isinstance(w_source, space.w_unicode)):
-        # hack: encode the unicode string as UTF-8 and attach
-        # a BOM at the start
-        w_source = space.call_method(w_source, 'encode', space.wrap('utf-8'))
-        str_ = space.str_w(w_source)
-        str_ = '\xEF\xBB\xBF' + str_
+        w_utf_8_source = space.call_method(w_source, "encode",
+                                           space.wrap("utf-8"))
+        str_ = space.str_w(w_utf_8_source)
+        # This flag tells the parser to reject any coding cookies it sees.
+        flags |= consts.PyCF_SOURCE_IS_UTF8
     else:
         str_ = space.str_w(w_source)
 



More information about the Pypy-commit mailing list