[pypy-commit] pypy py3k: space.wrap("xxx") now returns a unicode string!

Wed Oct 12 22:23:41 CEST 2011

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r47999:ef504fcb59bb
Date: 2011-10-12 22:19 +0200
http://bitbucket.org/pypy/pypy/changeset/ef504fcb59bb/

Log:	space.wrap("xxx") now returns a unicode string!
	space.str_w(w_someunicode) returns an RPython bytestring! (use
	space.wrapbytes and space.bytes_w to get the previous behavior)

diff --git a/pypy/interpreter/astcompiler/ast.py b/pypy/interpreter/astcompiler/ast.py
--- a/pypy/interpreter/astcompiler/ast.py
+++ b/pypy/interpreter/astcompiler/ast.py
@@ -1758,6 +1758,7 @@
     _col_offset_mask = 8
 
     def __init__(self, id, ctx, lineno, col_offset):
+        assert isinstance(id, str)
         self.id = id
         self.ctx = ctx
         expr.__init__(self, lineno, col_offset)
diff --git a/pypy/interpreter/astcompiler/astbuilder.py b/pypy/interpreter/astcompiler/astbuilder.py
--- a/pypy/interpreter/astcompiler/astbuilder.py
+++ b/pypy/interpreter/astcompiler/astbuilder.py
@@ -503,6 +503,8 @@
         return name
 
     def handle_arguments(self, arguments_node):
+        # This function handles both typedargslist (function definition)
+        # and varargslist (lambda definition).
         if arguments_node.type == syms.parameters:
             if len(arguments_node.children) == 2:
                 return ast.arguments(None, None, None, None)
@@ -517,7 +519,7 @@
         while i < child_count:
             argument = arguments_node.children[i]
             arg_type = argument.type
-            if arg_type == syms.tfpdef:
+            if arg_type == syms.tfpdef or arg_type == syms.vfpdef:
                 parenthesized = False
                 complex_args = False
                 while True:
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -188,13 +188,13 @@
 
     # -------------------------------------------------------------------
 
-    def str_w(self, space):
-        w_msg = typed_unwrap_error_msg(space, "string", self)
+    def bytes_w(self, space):
+        w_msg = typed_unwrap_error_msg(space, "bytes", self)
         raise OperationError(space.w_TypeError, w_msg)
 
     def unicode_w(self, space):
         raise OperationError(space.w_TypeError,
-                             typed_unwrap_error_msg(space, "unicode", self))
+                             typed_unwrap_error_msg(space, "string", self))
 
     def int_w(self, space):
         raise OperationError(space.w_TypeError,
@@ -1233,7 +1233,10 @@
         return self.str_w(w_obj)
 
     def str_w(self, w_obj):
-        return w_obj.str_w(self)
+        return self.unicode_w(w_obj).encode('ascii')
+
+    def bytes_w(self, w_obj):
+        return w_obj.bytes_w(self)
 
     def int_w(self, w_obj):
         return w_obj.int_w(self)
@@ -1561,7 +1564,7 @@
 
 ObjSpace.IrregularOpTable = [
     'wrap',
-    'str_w',
+    'bytes_w',
     'int_w',
     'float_w',
     'uint_w',
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -66,7 +66,7 @@
             exc_value    = str(w_value)
         else:
             w = space.wrap
-            if space.is_w(space.type(self.w_type), space.w_str):
+            if space.is_w(space.type(self.w_type), space.w_text):
                 exc_typename = space.str_w(self.w_type)
             else:
                 exc_typename = space.str_w(
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -721,7 +721,6 @@
     def IMPORT_NAME(self, nameindex, next_instr):
         space = self.space
         w_modulename = self.getname_w(nameindex)
-        modulename = self.space.str_w(w_modulename)
         w_fromlist = self.popvalue()
 
         w_flag = self.popvalue()
@@ -739,7 +738,6 @@
         w_locals = self.w_locals
         if w_locals is None:            # CPython does this
             w_locals = space.w_None
-        w_modulename = space.wrap(modulename)
         w_globals = self.w_globals
         if w_flag is None:
             w_obj = space.call_function(w_import, w_modulename, w_globals,
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -13,6 +13,8 @@
     rawmode = False
     unicode = True
 
+    assert isinstance(s, str)
+
     # string decoration handling
     o = ord(quote)
     isalpha = (o>=97 and o<=122) or (o>=65 and o<=90)
@@ -95,13 +97,13 @@
             w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u, space.wrap(encoding))
             return w_v
         else:
-            return space.wrap(substr)
+            return space.wrapbytes(substr)
 
     enc = None
     if need_encoding:
          enc = encoding
     v = PyString_DecodeEscape(space, substr, enc)
-    return space.wrap(v)
+    return space.wrapbytes(v)
 
 def hexbyte(val):
     result = "%x" % val
diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -4,14 +4,16 @@
 from pypy.interpreter.astcompiler import consts
 
 
-_recode_to_utf8 = gateway.applevel(r'''
-    def _recode_to_utf8(text, encoding):
-        return unicode(text, encoding).encode("utf-8")
-''').interphook('_recode_to_utf8')
+def decode_source(space, bytes, encoding=None):
+    if encoding is None:
+        encoding = 'utf-8'
+    if encoding == 'utf-8':
+        return bytes
+    text = space.unicode_w(space.call_function(space.w_unicode,
+                                               space.wrapbytes(bytes),
+                                               space.wrap(encoding)))
+    return text.encode('utf-8')
 
-def recode_to_utf8(space, text, encoding):
-    return space.str_w(_recode_to_utf8(space, space.wrap(text),
-                                          space.wrap(encoding)))
 def _normalize_encoding(encoding):
     """returns normalized name for <encoding>
 
@@ -94,7 +96,7 @@
         self.space = space
         self.future_flags = future_flags
 
-    def parse_source(self, textsrc, compile_info):
+    def parse_source(self, bytessrc, compile_info):
         """Main entry point for parsing Python source.
 
         Everything from decoding the source to tokenizing to building the parse
@@ -102,38 +104,39 @@
         """
         # Detect source encoding.
         enc = None
-        if textsrc.startswith("\xEF\xBB\xBF"):
-            textsrc = textsrc[3:]
+        if bytessrc.startswith("\xEF\xBB\xBF"):
+            bytessrc = bytessrc[3:]
             enc = 'utf-8'
             # If an encoding is explicitly given check that it is utf-8.
-            decl_enc = _check_for_encoding(textsrc)
+            decl_enc = _check_for_encoding(bytessrc)
             if decl_enc and decl_enc != "utf-8":
                 raise error.SyntaxError("UTF-8 BOM with non-utf8 coding cookie",
                                         filename=compile_info.filename)
+            textsrc = decode_source(self.space, bytessrc, enc)
         elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
             enc = 'utf-8'
-            if _check_for_encoding(textsrc) is not None:
+            if _check_for_encoding(bytessrc) is not None:
                 raise error.SyntaxError("coding declaration in unicode string",
                                         filename=compile_info.filename)
+            textsrc = decode_source(self.space, bytessrc, enc)
         else:
-            enc = _normalize_encoding(_check_for_encoding(textsrc))
-            if enc is not None and enc not in ('utf-8', 'iso-8859-1'):
-                try:
-                    textsrc = recode_to_utf8(self.space, textsrc, enc)
-                except OperationError, e:
-                    # if the codec is not found, LookupError is raised.  we
-                    # check using 'is_w' not to mask potential IndexError or
-                    # KeyError
-                    space = self.space
-                    if e.match(space, space.w_LookupError):
-                        raise error.SyntaxError("Unknown encoding: %s" % enc,
-                                                filename=compile_info.filename)
-                    # Transform unicode errors into SyntaxError
-                    if e.match(space, space.w_UnicodeDecodeError):
-                        e.normalize_exception(space)
-                        w_message = space.str(e.get_w_value(space))
-                        raise error.SyntaxError(space.str_w(w_message))
-                    raise
+            enc = _normalize_encoding(_check_for_encoding(bytessrc))
+            try:
+                textsrc = decode_source(self.space, bytessrc, enc)
+            except OperationError, e:
+                # if the codec is not found, LookupError is raised.  we
+                # check using 'is_w' not to mask potential IndexError or
+                # KeyError
+                space = self.space
+                if e.match(space, space.w_LookupError):
+                    raise error.SyntaxError("Unknown encoding: %s" % enc,
+                                            filename=compile_info.filename)
+                # Transform unicode errors into SyntaxError
+                if e.match(space, space.w_UnicodeDecodeError):
+                    e.normalize_exception(space)
+                    w_message = space.str(e.get_w_value(space))
+                    raise error.SyntaxError(space.text_w(w_message))
+                raise
 
         f_flags, future_info = future.get_futures(self.future_flags, textsrc)
         compile_info.last_future_import = future_info
diff --git a/pypy/module/__builtin__/compiling.py b/pypy/module/__builtin__/compiling.py
--- a/pypy/module/__builtin__/compiling.py
+++ b/pypy/module/__builtin__/compiling.py
@@ -26,18 +26,14 @@
 
     ast_node = None
     w_ast_type = space.gettypeobject(ast.AST.typedef)
-    str_ = None
+    source_str = None
     if space.is_true(space.isinstance(w_source, w_ast_type)):
         ast_node = space.interp_w(ast.mod, w_source)
         ast_node.sync_app_attrs(space)
-    elif space.is_true(space.isinstance(w_source, space.w_unicode)):
-        w_utf_8_source = space.call_method(w_source, "encode",
-                                           space.wrap("utf-8"))
-        str_ = space.str_w(w_utf_8_source)
+    else:
+        source_str = space.str_w(w_source)
         # This flag tells the parser to reject any coding cookies it sees.
         flags |= consts.PyCF_SOURCE_IS_UTF8
-    else:
-        str_ = space.str_w(w_source)
 
     ec = space.getexecutioncontext()
     if flags & ~(ec.compiler.compiler_flags | consts.PyCF_ONLY_AST |
@@ -56,10 +52,10 @@
 
     if ast_node is None:
         if flags & consts.PyCF_ONLY_AST:
-            mod = ec.compiler.compile_to_ast(str_, filename, mode, flags)
+            mod = ec.compiler.compile_to_ast(source_str, filename, mode, flags)
             return space.wrap(mod)
         else:
-            code = ec.compiler.compile(str_, filename, mode, flags)
+            code = ec.compiler.compile(source_str, filename, mode, flags)
     else:
         code = ec.compiler.compile_ast(ast_node, filename, mode, flags)
     return space.wrap(code)
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -47,8 +47,8 @@
     # space.{get,set,del}attr()...
     # Note that if w_name is already a string (or a subclass of str),
     # it must be returned unmodified (and not e.g. unwrapped-rewrapped).
-    if not space.is_w(space.type(w_name), space.w_str):
-        name = space.str_w(w_name)    # typecheck
+    if not space.is_w(space.type(w_name), space.w_text):
+        name = space.text_w(w_name)   # typecheck
         w_name = space.wrap(name)     # rewrap as a real string
     return w_name
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -386,7 +386,7 @@
         state = space.fromcache(CodecState)
         func = getattr(runicode, rname)
         result = func(uni, len(uni), errors, state.encode_error_handler)
-        return space.newtuple([space.wrap(result), space.wrap(len(uni))])
+        return space.newtuple([space.wrapbytes(result), space.wrap(len(uni))])
     wrap_encoder.func_name = rname
     globals()[name] = wrap_encoder
 
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -627,11 +627,11 @@
 
     def descr_init(self, space, w_encoding, w_object, w_start, w_end, w_reason):
         # typechecking
-        space.realstr_w(w_encoding)
-        space.realstr_w(w_object)
+        space.text_w(w_encoding)
+        space.str_w(w_object)
         space.int_w(w_start)
         space.int_w(w_end)
-        space.realstr_w(w_reason)
+        space.text_w(w_reason)
         # assign attributes
         self.w_encoding = w_encoding
         self.w_object = w_object
diff --git a/pypy/module/marshal/interp_marshal.py b/pypy/module/marshal/interp_marshal.py
--- a/pypy/module/marshal/interp_marshal.py
+++ b/pypy/module/marshal/interp_marshal.py
@@ -28,7 +28,7 @@
 by dump(data, file)."""
     m = StringMarshaller(space, space.int_w(w_version))
     m.dump_w_obj(w_data)
-    return space.wrap(m.get_value())
+    return space.wrapbytes(m.get_value())
 
 def load(space, w_f):
     """Read one value from the file 'f' and return it."""
diff --git a/pypy/module/sys/__init__.py b/pypy/module/sys/__init__.py
--- a/pypy/module/sys/__init__.py
+++ b/pypy/module/sys/__init__.py
@@ -42,7 +42,7 @@
         'argv'                  : 'state.get(space).w_argv',
         'py3kwarning'           : 'space.w_False',
         'warnoptions'           : 'state.get(space).w_warnoptions', 
-        'builtin_module_names'  : 'state.w_None',
+        'builtin_module_names'  : 'space.w_None',
         'pypy_getudir'          : 'state.pypy_getudir',    # not translated
         'pypy_initial_path'     : 'state.pypy_initial_path',
 
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -431,7 +431,7 @@
 
     def is_correct_type(self, w_obj):
         space = self.space
-        return space.is_w(space.type(w_obj), space.w_str)
+        return space.is_w(space.type(w_obj), space.w_text)
 
     def get_empty_storage(self):
         res = {}
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -68,6 +68,7 @@
             w_type = self.gettypeobject(typedef)
             self.builtin_types[typedef.name] = w_type
             setattr(self, 'w_' + typedef.name, w_type)
+        self.w_text = self.w_unicode
         self.builtin_types["NotImplemented"] = self.w_NotImplemented
         self.builtin_types["Ellipsis"] = self.w_Ellipsis
 
@@ -149,6 +150,9 @@
         assert typedef is not None
         return self.fromcache(stdtypedef.TypeCache).getorbuild(typedef)
 
+    def wrapbytes(self, bytes):
+        return wrapstr(self, bytes)
+
     def wrap(self, x):
         "Wraps the Python value 'x' into one of the wrapper classes."
         # You might notice that this function is rather conspicuously
@@ -170,7 +174,7 @@
             else:
                 return self.newint(x)
         if isinstance(x, str):
-            return wrapstr(self, x)
+            return wrapunicode(self, x.decode('ascii'))
         if isinstance(x, unicode):
             return wrapunicode(self, x)
         if isinstance(x, float):
diff --git a/pypy/objspace/std/ropeunicodeobject.py b/pypy/objspace/std/ropeunicodeobject.py
--- a/pypy/objspace/std/ropeunicodeobject.py
+++ b/pypy/objspace/std/ropeunicodeobject.py
@@ -78,9 +78,6 @@
         # for testing
         return w_self._node.flatten_unicode()
 
-    def str_w(w_self, space):
-        return space.str_w(space.str(w_self))
-
     def create_if_subclassed(w_self):
         if type(w_self) is W_RopeUnicodeObject:
             return w_self
diff --git a/pypy/objspace/std/strsliceobject.py b/pypy/objspace/std/strsliceobject.py
--- a/pypy/objspace/std/strsliceobject.py
+++ b/pypy/objspace/std/strsliceobject.py
@@ -30,7 +30,7 @@
         w_self.stop = len(str)
         return str
 
-    def str_w(w_self, space):
+    def bytes_w(w_self, space):
         return w_self.force()
 
     def __repr__(w_self):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -39,9 +39,6 @@
             return w_self
         return W_UnicodeObject(w_self._value)
 
-    def str_w(self, space):
-        return space.str_w(space.str(self))
-
     def unicode_w(self, space):
         return self._value
 
diff --git a/pypy/objspace/std/unicodetype.py b/pypy/objspace/std/unicodetype.py
--- a/pypy/objspace/std/unicodetype.py
+++ b/pypy/objspace/std/unicodetype.py
@@ -25,7 +25,7 @@
                     space.w_UnicodeDecodeError,
                     space.newtuple([
                     space.wrap('ascii'),
-                    space.wrap(s),
+                    space.wrapbytes(s),
                     space.wrap(i),
                     space.wrap(i+1),
                     space.wrap("ordinal not in range(128)")]))
@@ -191,7 +191,7 @@
                                        startingpos, endingpos):
         raise OperationError(space.w_UnicodeDecodeError,
                              space.newtuple([space.wrap(encoding),
-                                             space.wrap(s),
+                                             space.wrapbytes(s),
                                              space.wrap(startingpos),
                                              space.wrap(endingpos),
                                              space.wrap(msg)]))