[pypy-svn] r75737 - in pypy/trunk/pypy: interpreter interpreter/pyparser module/_codecs module/_codecs/test objspace/std rlib

afa at codespeak.net
Thu Jul 1 22:24:33 CEST 2010


Author: afa
Date: Thu Jul  1 22:24:32 2010
New Revision: 75737

Removed:
   pypy/trunk/pypy/module/_codecs/app_codecs.py
Modified:
   pypy/trunk/pypy/interpreter/pyparser/parsestring.py
   pypy/trunk/pypy/interpreter/unicodehelper.py
   pypy/trunk/pypy/module/_codecs/__init__.py
   pypy/trunk/pypy/module/_codecs/interp_codecs.py
   pypy/trunk/pypy/module/_codecs/test/test_codecs.py
   pypy/trunk/pypy/objspace/std/marshal_impl.py
   pypy/trunk/pypy/objspace/std/unicodeobject.py
   pypy/trunk/pypy/rlib/rstring.py
   pypy/trunk/pypy/rlib/runicode.py
Log:
Merge branch/interplevel-codecs:
Rewrite all the codecs that were implemented at applevel as interp-level code,
and move most of them to rlib.runicode, where they can also be used by RPython programs.

- This fixes translation with -O0: that option disables geninterp and uses the
pypy compiler instead to compile applevel code, but the compiler needs the
codecs module to decode unicode literals, which cannot work as long as the
codecs themselves are applevel code...

- This also removes some huge geninterp'd code: app_codecs.py used to contain
3 or 4 functions, each rendered as 20000 lines of C code!

- Use StringBuilder everywhere (an illustrative sketch of the pattern follows).
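
Not part of the patch, just an illustration of the builder pattern the
rewritten codecs now follow: pre-size the builder, append pieces as they are
produced, and build the result string once at the end. The function name is
invented for the example and it assumes a PyPy source checkout on sys.path.

from pypy.rlib.rstring import UnicodeBuilder

def decode_latin_1_sketch(s):
    # Pre-sizing avoids growing an intermediate list of characters,
    # which is what the old applevel/geninterp'd code effectively did.
    builder = UnicodeBuilder(len(s))
    for ch in s:
        builder.append(unichr(ord(ch)))   # append each decoded character
    return builder.build()                # materialize the unicode result once

# e.g. decode_latin_1_sketch("caf\xe9") == u"caf\xe9"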



Modified: pypy/trunk/pypy/interpreter/pyparser/parsestring.py
==============================================================================
--- pypy/trunk/pypy/interpreter/pyparser/parsestring.py	(original)
+++ pypy/trunk/pypy/interpreter/pyparser/parsestring.py	Thu Jul  1 22:24:32 2010
@@ -72,12 +72,12 @@
             bufp = 0
             bufq = len(buf)
         assert 0 <= bufp <= bufq
-        w_substr = space.wrap(buf[bufp : bufq])
+        substr = buf[bufp:bufq]
         if rawmode:
-            w_v = unicodehelper.PyUnicode_DecodeRawUnicodeEscape(space, w_substr)
+            v = unicodehelper.PyUnicode_DecodeRawUnicodeEscape(space, substr)
         else:
-            w_v = unicodehelper.PyUnicode_DecodeUnicodeEscape(space, w_substr)
-        return w_v
+            v = unicodehelper.PyUnicode_DecodeUnicodeEscape(space, substr)
+        return space.wrap(v)
 
     need_encoding = (encoding is not None and
                      encoding != "utf-8" and encoding != "iso-8859-1")
@@ -86,7 +86,7 @@
     substr = s[ps : q]
     if rawmode or '\\' not in s[ps:]:
         if need_encoding:
-            w_u = unicodehelper.PyUnicode_DecodeUTF8(space, space.wrap(substr))
+            w_u = space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, substr))
             #w_v = space.wrap(space.unwrap(w_u).encode(encoding)) this works
             w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u, space.wrap(encoding))
             return w_v
@@ -96,7 +96,7 @@
     enc = None
     if need_encoding:
          enc = encoding
-    v = PyString_DecodeEscape(space, substr, unicode, enc)
+    v = PyString_DecodeEscape(space, substr, enc)
     return space.wrap(v)
 
 def hexbyte(val):
@@ -105,10 +105,9 @@
         result = "0" + result
     return result
 
-def PyString_DecodeEscape(space, s, unicode, recode_encoding):
+def PyString_DecodeEscape(space, s, recode_encoding):
     """
-    Unescape a backslash-escaped string. If unicode is non-zero,
-    the string is a u-literal. If recode_encoding is non-zero,
+    Unescape a backslash-escaped string. If recode_encoding is non-zero,
     the string is UTF-8 encoded and should be re-encoded in the
     specified encoding.
     """
@@ -171,9 +170,6 @@
                 raise_app_valueerror(space, 'invalid \\x escape')
             # ignored replace and ignore for now
 
-        elif unicode and (ch == 'u' or ch == 'U' or ch == 'N'):
-            raise_app_valueerror(space, 'Unicode escapes not legal '
-                                        'when Unicode disabled')
         else:
             # this was not an escape, so the backslash
             # has to be added, and we start over in
@@ -200,7 +196,7 @@
     # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
     while ps < end and ord(s[ps]) & 0x80:
         ps += 1
-    w_u = unicodehelper.PyUnicode_DecodeUTF8(space, space.wrap(s[pt : ps]))
+    w_u = space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, s[pt:ps]))
     w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u, space.wrap(encoding))
     v = space.str_w(w_v)
     return v, ps

Modified: pypy/trunk/pypy/interpreter/unicodehelper.py
==============================================================================
--- pypy/trunk/pypy/interpreter/unicodehelper.py	(original)
+++ pypy/trunk/pypy/interpreter/unicodehelper.py	Thu Jul  1 22:24:32 2010
@@ -1,30 +1,10 @@
-from pypy.interpreter import gateway
+from pypy.module._codecs import interp_codecs
 
-app = gateway.applevel(r'''
-    def PyUnicode_DecodeUnicodeEscape(data):
-        import _codecs
-        return _codecs.unicode_escape_decode(data)[0]
+def PyUnicode_AsEncodedString(space, w_data, w_encoding):
+    return interp_codecs.encode(space, w_data, w_encoding)
 
-    def PyUnicode_DecodeRawUnicodeEscape(data):
-        import _codecs
-        return _codecs.raw_unicode_escape_decode(data)[0]
-
-    def PyUnicode_DecodeUTF8(data):
-        import _codecs
-        return _codecs.utf_8_decode(data)[0]
-
-    def PyUnicode_AsEncodedString(data, encoding):
-        import _codecs
-        return _codecs.encode(data, encoding)
-
-    def PyUnicode_EncodeUTF8(data):
-        import _codecs
-        return _codecs.utf_8_encode(data)[0]
-
-''')
-
-PyUnicode_DecodeUnicodeEscape = app.interphook('PyUnicode_DecodeUnicodeEscape')
-PyUnicode_DecodeRawUnicodeEscape = app.interphook('PyUnicode_DecodeRawUnicodeEscape')
-PyUnicode_DecodeUTF8 = app.interphook('PyUnicode_DecodeUTF8')
-PyUnicode_AsEncodedString = app.interphook('PyUnicode_AsEncodedString')
-PyUnicode_EncodeUTF8 = app.interphook('PyUnicode_EncodeUTF8') 
+# These functions take and return unwrapped rpython strings and unicodes
+PyUnicode_DecodeUnicodeEscape = interp_codecs.make_raw_decoder('unicode_escape')
+PyUnicode_DecodeRawUnicodeEscape = interp_codecs.make_raw_decoder('raw_unicode_escape')
+PyUnicode_DecodeUTF8 = interp_codecs.make_raw_decoder('utf_8')
+PyUnicode_EncodeUTF8 = interp_codecs.make_raw_encoder('utf_8')
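
(Illustration only, not part of the patch.) The helpers above now take and
return unwrapped RPython strings and unicodes, so callers wrap only at the
applevel boundary, roughly like this; the function name is invented:

from pypy.interpreter import unicodehelper

def parse_utf8_literal(space, data):
    # data is an unwrapped RPython str; the helper returns an unwrapped unicode
    u = unicodehelper.PyUnicode_DecodeUTF8(space, data)
    # wrap only where an applevel object is actually needed
    return space.wrap(u)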

Modified: pypy/trunk/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/__init__.py	(original)
+++ pypy/trunk/pypy/module/_codecs/__init__.py	Thu Jul  1 22:24:32 2010
@@ -3,22 +3,43 @@
 from pypy.module._codecs import interp_codecs
 
 class Module(MixedModule):
-    appleveldefs = {
-         '__doc__' :  'app_codecs.__doc__',
-         '__name__' :  'app_codecs.__name__',
-         'charmap_encode' :  'app_codecs.charmap_encode',
-         'escape_decode' :  'app_codecs.escape_decode',
-         'escape_encode' :  'app_codecs.escape_encode',
-         'raw_unicode_escape_decode' :  'app_codecs.raw_unicode_escape_decode',
-         'raw_unicode_escape_encode' :  'app_codecs.raw_unicode_escape_encode',
-         'unicode_escape_decode' :  'app_codecs.unicode_escape_decode',
-         'unicode_escape_encode' :  'app_codecs.unicode_escape_encode',
-         'unicode_internal_decode' :  'app_codecs.unicode_internal_decode',
-         'unicode_internal_encode' :  'app_codecs.unicode_internal_encode',
-         'utf_7_decode' :  'app_codecs.utf_7_decode',
-         'utf_7_encode' :  'app_codecs.utf_7_encode',
-         'charmap_build' : 'app_codecs.charmap_build'
-    }
+    """
+   _codecs -- Provides access to the codec registry and the builtin
+              codecs.
+
+   This module should never be imported directly. The standard library
+   module "codecs" wraps this builtin module for use within Python.
+
+   The codec registry is accessible via:
+
+     register(search_function) -> None
+
+     lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
+
+   The builtin Unicode codecs use the following interface:
+
+     <encoding>_encode(Unicode_object[,errors='strict']) -> 
+         (string object, bytes consumed)
+
+     <encoding>_decode(char_buffer_obj[,errors='strict']) -> 
+        (Unicode object, bytes consumed)
+
+   <encoding>_encode() interfaces also accept non-Unicode object as
+   input. The objects are then converted to Unicode using
+   PyUnicode_FromObject() prior to applying the conversion.
+
+   These <encoding>s are available: utf_8, unicode_escape,
+   raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
+   mbcs (on win32).
+
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+Copyright (c) Corporation for National Research Initiatives.
+"""
+
+    appleveldefs = {}
+
     interpleveldefs = {
          'encode':         'interp_codecs.encode',
          'decode':         'interp_codecs.decode',
@@ -26,12 +47,15 @@
          'lookup_error':   'interp_codecs.lookup_error',
          'register':       'interp_codecs.register_codec',
          'register_error': 'interp_codecs.register_error',
+         'charmap_build' : 'interp_codecs.charmap_build',
 
          # encoders and decoders
          'ascii_decode'     : 'interp_codecs.ascii_decode',
          'ascii_encode'     : 'interp_codecs.ascii_encode',
          'latin_1_decode'   : 'interp_codecs.latin_1_decode',
          'latin_1_encode'   : 'interp_codecs.latin_1_encode',
+         'utf_7_decode'     : 'interp_codecs.utf_7_decode',
+         'utf_7_encode'     : 'interp_codecs.utf_7_encode',
          'utf_8_decode'     : 'interp_codecs.utf_8_decode',
          'utf_8_encode'     : 'interp_codecs.utf_8_encode',
          'utf_16_be_decode' : 'interp_codecs.utf_16_be_decode',
@@ -44,6 +68,15 @@
          'charbuffer_encode': 'interp_codecs.buffer_encode',
          'readbuffer_encode': 'interp_codecs.buffer_encode',
          'charmap_decode'   : 'interp_codecs.charmap_decode',
+         'charmap_encode'   : 'interp_codecs.charmap_encode',
+         'escape_encode'    : 'interp_codecs.escape_encode',
+         'escape_decode'    : 'interp_codecs.escape_decode',
+         'unicode_escape_decode'     :  'interp_codecs.unicode_escape_decode',
+         'unicode_escape_encode'     :  'interp_codecs.unicode_escape_encode',
+         'raw_unicode_escape_decode' :  'interp_codecs.raw_unicode_escape_decode',
+         'raw_unicode_escape_encode' :  'interp_codecs.raw_unicode_escape_encode',
+         'unicode_internal_decode'   :  'interp_codecs.unicode_internal_decode',
+         'unicode_internal_encode'   :  'interp_codecs.unicode_internal_encode',
     }
 
     def __init__(self, space, *args):
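
(Illustration only, not part of the patch.) The <encoding>_decode functions
documented above return a (unicode object, amount of input consumed) pair and
the <encoding>_encode functions a (string object, amount of input consumed)
pair, the same values CPython's _codecs returns:

import _codecs

assert _codecs.utf_8_decode('abc') == (u'abc', 3)
assert _codecs.latin_1_encode(u'caf\xe9') == ('caf\xe9', 4)
assert _codecs.raw_unicode_escape_decode('\\u1234') == (u'\u1234', 6)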

Modified: pypy/trunk/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/interp_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/interp_codecs.py	Thu Jul  1 22:24:32 2010
@@ -1,5 +1,6 @@
 from pypy.interpreter.error import OperationError, operationerrfmt
-from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, applevel
+from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, interp2app
+from pypy.interpreter.gateway import unwrap_spec
 from pypy.interpreter.baseobjspace import W_Root
 from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
 from pypy.rlib.objectmodel import we_are_translated
@@ -13,6 +14,8 @@
         self.decode_error_handler = self.make_errorhandler(space, True)
         self.encode_error_handler = self.make_errorhandler(space, False)
 
+        self.unicodedata_handler = None
+
     def make_errorhandler(self, space, decode):
         def unicode_call_errorhandler(errors,  encoding, reason, input,
                                       startpos, endpos):
@@ -53,6 +56,21 @@
                 return replace, newpos
         return unicode_call_errorhandler
 
+    def get_unicodedata_handler(self, space):
+        if self.unicodedata_handler:
+            return self.unicodedata_handler
+        try:
+            w_builtin = space.getbuiltinmodule('__builtin__')
+            w_import = space.getattr(w_builtin, space.wrap("__import__"))
+            w_unicodedata = space.call_function(w_import,
+                                                space.wrap("unicodedata"))
+            w_getcode = space.getattr(w_unicodedata, space.wrap("_get_code"))
+        except OperationError:
+            return None
+        else:
+            self.unicodedata_handler = UnicodeData_Handler(space, w_getcode)
+            return self.unicodedata_handler
+
     def _freeze_(self):
         assert not self.codec_search_path
         return False
@@ -114,78 +132,125 @@
         "unknown encoding: %s", encoding)
 lookup_codec.unwrap_spec = [ObjSpace, str]
 
-app_errors = applevel("""
-def check_exception(exc):
+# ____________________________________________________________
+# Register standard error handlers
+
+def check_exception(space, w_exc):
     try:
-        delta = exc.end - exc.start
-        if delta < 0 or not isinstance(exc.object, (unicode, str)):
-            raise TypeError("wrong exception")
-    except AttributeError:
-        raise TypeError("wrong exception")
-
-def strict_errors(exc):
-    if isinstance(exc, Exception):
-        raise exc
-    else:
-        raise TypeError("codec must pass exception instance")
-
-def ignore_errors(exc):
-    check_exception(exc)
-    if isinstance(exc, UnicodeEncodeError):
-        return u'', exc.end
-    elif isinstance(exc, (UnicodeDecodeError, UnicodeTranslateError)):
-        return u'', exc.end
-    else: 
-        raise TypeError("don't know how to handle %.400s in error callback"%exc)
-
-Py_UNICODE_REPLACEMENT_CHARACTER = u"\ufffd"
-
-def replace_errors(exc):
-    check_exception(exc)
-    if isinstance(exc, UnicodeEncodeError):
-        return u'?'*(exc.end-exc.start), exc.end
-    elif isinstance(exc, (UnicodeTranslateError, UnicodeDecodeError)):
-        return Py_UNICODE_REPLACEMENT_CHARACTER*(exc.end-exc.start), exc.end
-    else:
-        raise TypeError("don't know how to handle %.400s in error callback"%exc)
-
-def xmlcharrefreplace_errors(exc):
-    if isinstance(exc, UnicodeEncodeError):
-        res = []
-        for ch in exc.object[exc.start:exc.end]:
-            res += '&#'
-            res += str(ord(ch))
-            res += ';'
-        return u''.join(res), exc.end
-    else:
-        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
-
-def backslashreplace_errors(exc):
-    if isinstance(exc, UnicodeEncodeError):
-        p = []
-        for c in exc.object[exc.start:exc.end]:
-            p += '\\\\'
-            oc = ord(c)
-            if (oc >= 0x00010000):
-                p += 'U'
-                p += "%.8x" % ord(c)
+        w_start = space.getattr(w_exc, space.wrap('start'))
+        w_end = space.getattr(w_exc, space.wrap('end'))
+        w_obj = space.getattr(w_exc, space.wrap('object'))
+    except OperationError, e:
+        if not e.match(space, space.w_AttributeError):
+            raise
+        raise OperationError(space.w_TypeError, space.wrap(
+            "wrong exception"))
+
+    delta = space.int_w(w_end) - space.int_w(w_start)
+    if delta < 0 or not (space.isinstance_w(w_obj, space.w_str) or
+                         space.isinstance_w(w_obj, space.w_unicode)):
+        raise OperationError(space.w_TypeError, space.wrap(
+            "wrong exception"))
+
+def strict_errors(space, w_exc):
+    check_exception(space, w_exc)
+    if space.isinstance_w(w_exc, space.w_BaseException):
+        raise OperationError(space.type(w_exc), w_exc)
+    else:
+        raise OperationError(space.w_TypeError, space.wrap(
+            "codec must pass exception instance"))
+
+def ignore_errors(space, w_exc):
+    check_exception(space, w_exc)
+    w_end = space.getattr(w_exc, space.wrap('end'))
+    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+        return space.newtuple([space.wrap(''), w_end])
+    elif (space.isinstance_w(w_exc, space.w_UnicodeDecodeError) or
+          space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+        return space.newtuple([space.wrap(u''), w_end])
+    else:
+        typename = space.type(w_exc).getname(space, '?')
+        raise operationerrfmt(space.w_TypeError,
+            "don't know how to handle %s in error callback", typename)
+
+def replace_errors(space, w_exc):
+    check_exception(space, w_exc)
+    w_start = space.getattr(w_exc, space.wrap('start'))
+    w_end = space.getattr(w_exc, space.wrap('end'))
+    size = space.int_w(w_end) - space.int_w(w_start)
+    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+        text = '?' * size
+        return space.newtuple([space.wrap(text), w_end])
+    elif (space.isinstance_w(w_exc, space.w_UnicodeDecodeError) or
+          space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+        text = u'\ufffd' * size
+        return space.newtuple([space.wrap(text), w_end])
+    else:
+        typename = space.type(w_exc).getname(space, '?')
+        raise operationerrfmt(space.w_TypeError,
+            "don't know how to handle %s in error callback", typename)
+
+def xmlcharrefreplace_errors(space, w_exc):
+    check_exception(space, w_exc)
+    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+        obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
+        start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+        w_end = space.getattr(w_exc, space.wrap('end'))
+        end = space.int_w(w_end)
+        builder = UnicodeBuilder()
+        pos = start
+        while pos < end:
+            ch = obj[pos]
+            builder.append(u"&#")
+            builder.append(unicode(str(ord(ch))))
+            builder.append(u";")
+            pos += 1
+        return space.newtuple([space.wrap(builder.build()), w_end])
+    else:
+        typename = space.type(w_exc).getname(space, '?')
+        raise operationerrfmt(space.w_TypeError,
+            "don't know how to handle %s in error callback", typename)
+
+def backslashreplace_errors(space, w_exc):
+    check_exception(space, w_exc)
+    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+        obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
+        start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+        w_end = space.getattr(w_exc, space.wrap('end'))
+        end = space.int_w(w_end)
+        builder = UnicodeBuilder()
+        pos = start
+        while pos < end:
+            oc = ord(obj[pos])
+            num = hex(oc)
+            if (oc >= 0x10000):
+                builder.append(u"\\U")
+                zeros = 8
             elif (oc >= 0x100):
-                p += 'u'
-                p += "%.4x" % ord(c)
+                builder.append(u"\\u")
+                zeros = 4
             else:
-                p += 'x'
-                p += "%.2x" % ord(c)
-        return u''.join(p), exc.end
-    else:
-        raise TypeError("don't know how to handle %.400s in error callback"%type(exc))
-""")
+                builder.append(u"\\x")
+                zeros = 2
+            lnum = len(num)
+            nb = zeros + 2 - lnum # num starts with '0x'
+            if nb > 0:
+                builder.append_multiple_char(u'0', nb)
+            builder.append_slice(unicode(num), 2, lnum)
+            pos += 1
+        return space.newtuple([space.wrap(builder.build()), w_end])
+    else:
+        typename = space.type(w_exc).getname(space, '?')
+        raise operationerrfmt(space.w_TypeError,
+            "don't know how to handle %s in error callback", typename)
 
 def register_builtin_error_handlers(space):
+    "NOT_RPYTHON"
     state = space.fromcache(CodecState)
     for error in ("strict", "ignore", "replace", "xmlcharrefreplace",
                   "backslashreplace"):
         name = error + "_errors"
-        state.codec_error_registry[error] = app_errors.wget(space, name)
+        state.codec_error_registry[error] = space.wrap(interp2app(globals()[name]))
 
 
 def lookup_error(space, errors):
@@ -279,6 +344,38 @@
 
 from pypy.rlib import runicode
 
+def make_raw_encoder(name):
+    rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
+    assert hasattr(runicode, rname)
+    def raw_encoder(space, uni):
+        state = space.fromcache(CodecState)
+        func = getattr(runicode, rname)
+        errors = "strict"
+        return func(uni, len(uni), errors, state.encode_error_handler)
+    raw_encoder.func_name = rname
+    return raw_encoder
+
+def make_raw_decoder(name):
+    rname = "str_decode_%s" % (name.replace("_decode", ""), )
+    assert hasattr(runicode, rname)
+    def raw_decoder(space, string):
+        final = True
+        errors = "strict"
+        state = space.fromcache(CodecState)
+        func = getattr(runicode, rname)
+        kwargs = {}
+        if name == 'unicode_escape':
+            unicodedata_handler = state.get_unicodedata_handler(space)
+            result, consumed = func(string, len(string), errors,
+                                    final, state.decode_error_handler,
+                                    unicodedata_handler=unicodedata_handler)
+        else:
+            result, consumed = func(string, len(string), errors,
+                                    final, state.decode_error_handler)
+        return result
+    raw_decoder.func_name = rname
+    return raw_decoder
+
 def make_encoder_wrapper(name):
     rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
     assert hasattr(runicode, rname)
@@ -308,20 +405,26 @@
 for encoders in [
          "ascii_encode",
          "latin_1_encode",
+         "utf_7_encode",
          "utf_8_encode",
          "utf_16_encode",
          "utf_16_be_encode",
          "utf_16_le_encode",
+         "unicode_escape_encode",
+         "raw_unicode_escape_encode",
+         "unicode_internal_encode",
         ]:
     make_encoder_wrapper(encoders)
 
 for decoders in [
          "ascii_decode",
          "latin_1_decode",
+         "utf_7_decode",
          "utf_8_decode",
          "utf_16_decode",
          "utf_16_be_decode",
          "utf_16_le_decode",
+         "raw_unicode_escape_decode",
          ]:
     make_decoder_wrapper(decoders)
 
@@ -330,8 +433,6 @@
     make_decoder_wrapper('mbcs_decode')
 
 def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
-    """None
-    """
     final = space.is_true(w_final)
     state = space.fromcache(CodecState)
     if byteorder == 0:
@@ -349,77 +450,213 @@
                            space.wrap(byteorder)])
 utf_16_ex_decode.unwrap_spec = [ObjSpace, str, str, int, W_Root]
 
-def _extract_from_mapping(space, mapping_w, w_mapping, ch):
-    if mapping_w is not None:
+# ____________________________________________________________
+# Charmap
+
+class Charmap_Decode:
+    def __init__(self, space, w_mapping):
+        self.space = space
+        self.w_mapping = w_mapping
+
+        # fast path for all the stuff in the encodings module
+        if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
+            self.mapping_w = space.fixedview(w_mapping)
+        else:
+            self.mapping_w = None
+
+    def get(self, ch, errorchar):
+        space = self.space
+
+        # get the character from the mapping
+        if self.mapping_w is not None:
+            w_ch = self.mapping_w[ord(ch)]
+        else:
+            try:
+                w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+            except OperationError, e:
+                if not e.match(space, space.w_LookupError):
+                    raise
+                return errorchar
+
+        # Charmap may return a unicode string
         try:
-            return mapping_w[ord(ch)]
-        except IndexError:
-            pass
-    else:
+            x = space.unicode_w(w_ch)
+        except OperationError, e:
+            if not e.match(space, space.w_TypeError):
+                raise
+        else:
+            return x
+
+        # Charmap may return a number
+        try:
+            x = space.int_w(w_ch)
+        except OperationError, e:
+            if not e.match(space, space.w_TypeError):
+                raise
+        else:
+            if 0 <= x < 65536: # Even on wide unicode builds...
+                return unichr(x)
+            else:
+                raise OperationError(space.w_TypeError, space.wrap(
+                    "character mapping must be in range(65536)"))
+
+        # Charmap may return None
+        if space.is_w(w_ch, space.w_None):
+            return errorchar
+
+        raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
+
+class Charmap_Encode:
+    def __init__(self, space, w_mapping):
+        self.space = space
+        self.w_mapping = w_mapping
+
+    def get(self, ch, errorchar):
+        space = self.space
+
+        # get the character from the mapping
         try:
-            return space.getitem(w_mapping, space.newint(ord(ch)))
+            w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
         except OperationError, e:
-            if (not e.match(space, space.w_KeyError) and
-                not e.match(space, space.w_IndexError)):
+            if not e.match(space, space.w_LookupError):
                 raise
-            pass
+            return errorchar
 
-def _append_unicode(space, builder, w_x):
-    try:
-        x = space.unicode_w(w_x)
-    except OperationError, e:
-        if not e.match(space, space.w_TypeError):
-            raise
-    else:
-        if x != u"\ufffe":
-            builder.append(x)
-            return True
-        return False
-    try:
-        x = space.int_w(w_x)
-    except OperationError:
-        if not e.match(space, space.w_TypeError):
-            raise
-    else:
-        if x < 65536:
-            builder.append(unichr(x))
+        # Charmap may return a string
+        try:
+            x = space.realstr_w(w_ch)
+        except OperationError, e:
+            if not e.match(space, space.w_TypeError):
+                raise
         else:
-            raise OperationError(space.w_TypeError, space.wrap("character mapping must be in range(65536)"))
-        return True
-    if not space.is_true(w_x):
-        return False
+            return x
+
+        # Charmap may return a number
+        try:
+            x = space.int_w(w_ch)
+        except OperationError, e:
+            if not e.match(space, space.w_TypeError):
+                raise
+        else:
+            if 0 <= x < 256:
+                return chr(x)
+            else:
+                raise OperationError(space.w_TypeError, space.wrap(
+                    "character mapping must be in range(256)"))
+
+        # Charmap may return None
+        if space.is_w(w_ch, space.w_None):
+            return errorchar
+
+        raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
+
+
+@unwrap_spec(ObjSpace, str, str, W_Root)
+def charmap_decode(space, string, errors="strict", w_mapping=None):
+    if len(string) == 0:
+        return space.newtuple([space.wrap(u''), space.wrap(0)])
+
+    if space.is_w(w_mapping, space.w_None):
+        mapping = None
     else:
-        raise OperationError(space.w_TypeError, space.w_None)
+        mapping = Charmap_Decode(space, w_mapping)
+
+    final = True
+    state = space.fromcache(CodecState)
+    result, consumed = runicode.str_decode_charmap(
+        string, len(string), errors,
+        final, state.decode_error_handler, mapping)
+    return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+@unwrap_spec(ObjSpace, unicode, str, W_Root)
+def charmap_encode(space, uni, errors="strict", w_mapping=None):
+    if space.is_w(w_mapping, space.w_None):
+        mapping = None
+    else:
+        mapping = Charmap_Encode(space, w_mapping)
+
+    state = space.fromcache(CodecState)
+    result = runicode.unicode_encode_charmap(
+        uni, len(uni), errors,
+        state.encode_error_handler, mapping)
+    return space.newtuple([space.wrap(result), space.wrap(len(uni))])
 
 
-def charmap_decode(space, s, errors="strict", w_mapping=None):
-    size = len(s)
-    # Default to Latin-1
-    if space.is_true(space.is_(w_mapping, space.w_None)):
-        return latin_1_decode(space, s, errors, space.w_False)
+@unwrap_spec(ObjSpace, unicode)
+def charmap_build(space, chars):
+    # XXX CPython sometimes uses a three-level trie
+    w_charmap = space.newdict()
+    for num in range(len(chars)):
+        elem = chars[num]
+        space.setitem(w_charmap, space.newint(ord(elem)), space.newint(num))
+    return w_charmap
 
-    if (size == 0):
+# ____________________________________________________________
+# Unicode escape
+
+class UnicodeData_Handler:
+    def __init__(self, space, w_getcode):
+        self.space = space
+        self.w_getcode = w_getcode
+
+    def call(self, name):
+        space = self.space
+        try:
+            w_code = space.call_function(self.w_getcode, space.wrap(name))
+        except OperationError, e:
+            if not e.match(space, space.w_KeyError):
+                raise
+            return -1
+        return space.int_w(w_code)
+
+@unwrap_spec(ObjSpace, 'bufferstr', str, W_Root)
+def unicode_escape_decode(space, string, errors="strict", w_final=False):
+    final = space.is_true(w_final)
+    state = space.fromcache(CodecState)
+    errorhandler=state.decode_error_handler
+
+    unicode_name_handler = state.get_unicodedata_handler(space)
+
+    result, consumed = runicode.str_decode_unicode_escape(
+        string, len(string), errors,
+        final, state.decode_error_handler,
+        unicode_name_handler)
+
+    return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+# ____________________________________________________________
+# Unicode-internal
+
+@unwrap_spec(ObjSpace, W_Root, str)
+def unicode_internal_decode(space, w_string, errors="strict"):
+    # special case for this codec: unicodes are returned as is
+    if space.isinstance_w(w_string, space.w_unicode):
+        return space.newtuple([w_string, space.len(w_string)])
+
+    string = space.str_w(w_string)
+
+    if len(string) == 0:
         return space.newtuple([space.wrap(u''), space.wrap(0)])
-    
-    # fast path for all the stuff in the encodings module
-    if space.is_true(space.isinstance(w_mapping, space.w_tuple)):
-        mapping_w = space.fixedview(w_mapping)
-    else:
-        mapping_w = None
-
-    builder = UnicodeBuilder(size)
-    inpos = 0
-    while (inpos < len(s)):
-        #/* Get mapping_w (char ordinal -> integer, Unicode char or None) */
-        ch = s[inpos]
-        w_x = _extract_from_mapping(space, mapping_w, w_mapping, ch)
-        if w_x is not None and _append_unicode(space, builder, w_x):
-            inpos += 1
-            continue
-        state = space.fromcache(CodecState)
-        next, inpos = state.decode_error_handler(errors, "charmap",
-                   "character maps to <undefined>", s, inpos, inpos+1)
-        builder.append(next)
-    res = builder.build()
-    return space.newtuple([space.wrap(res), space.wrap(size)])
-charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]
+
+    final = True
+    state = space.fromcache(CodecState)
+    result, consumed = runicode.str_decode_unicode_internal(
+        string, len(string), errors,
+        final, state.decode_error_handler)
+    return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
+# ____________________________________________________________
+# support for the "string escape" codec
+# This is a bytes-to-bytes transformation
+
+@unwrap_spec(ObjSpace, W_Root, str)
+def escape_encode(space, w_string, errors='strict'):
+    w_repr = space.repr(w_string)
+    w_result = space.getslice(w_repr, space.wrap(1), space.wrap(-1))
+    return space.newtuple([w_result, space.len(w_string)])
+
+@unwrap_spec(ObjSpace, str, str)
+def escape_decode(space, data, errors='strict'):
+    from pypy.interpreter.pyparser.parsestring import PyString_DecodeEscape
+    result = PyString_DecodeEscape(space, data, None)
+    return space.newtuple([space.wrap(result), space.wrap(len(data))])
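
(Illustration only, not part of the patch.) The charmap codec entry points
added above follow the usual mapping conventions: for decoding, the mapping
takes a byte ordinal to a unicode character, a code point below 65536 or None;
for encoding, it takes a code point to a byte string, a byte value or None.

import _codecs

decode_map = {ord('a'): u'X', ord('b'): 0x79}    # unicode char or code point
assert _codecs.charmap_decode('ab', 'strict', decode_map) == (u'Xy', 2)

encode_map = {ord('a'): 'X', ord('b'): 0x79}     # byte string or byte value
assert _codecs.charmap_encode(u'ab', 'strict', encode_map) == ('Xy', 2)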

Modified: pypy/trunk/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/test/test_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/test/test_codecs.py	Thu Jul  1 22:24:32 2010
@@ -1,7 +1,5 @@
 import autopath
 from pypy.conftest import gettestobjspace
-from pypy.module._codecs.app_codecs import unicode_escape_encode,\
-     charmap_encode, unicode_escape_decode
 
 
 class AppTestCodecs:
@@ -14,26 +12,16 @@
         raises(TypeError, _codecs.register, 1)
 
     def test_bigU_codecs(self):
-        import sys
-        oldmaxunicode = sys.maxunicode
-        if sys.maxunicode <= 0xffff:
-            return # this test cannot run on UCS2 builds
         u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
         for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                          'raw_unicode_escape',
                          'unicode_escape', 'unicode_internal'):
             assert unicode(u.encode(encoding),encoding) == u
-        sys.maxunicode = oldmaxunicode
 
     def test_ucs4(self):
-        import sys
-        oldmaxunicode = sys.maxunicode
-        if sys.maxunicode <= 0xffff:
-            sys.maxunicode = 0xffffffff
         x = u'\U00100000'
         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
         assert x == y 
-        sys.maxunicode = oldmaxunicode
 
     def test_named_unicode(self):
         assert unicode('\\N{SPACE}','unicode-escape') == u" "
@@ -118,12 +106,20 @@
 
     def test_charmap_decode(self):
         from _codecs import charmap_decode
+        import sys
         assert charmap_decode('', 'strict', 'blablabla') == ('', 0)
         assert charmap_decode('xxx') == ('xxx', 3)
         assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
         map = tuple([unichr(i) for i in range(256)])
         assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4)
 
+        raises(TypeError, charmap_decode, '\xff', "replace",  {0xff: 0x10001})
+
+    def test_unicode_escape(self):
+        from _codecs import unicode_escape_encode, unicode_escape_decode
+        assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
+        assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
+        assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
 
 class AppTestPartialEvaluation:
 
@@ -377,6 +373,9 @@
 
     def test_charmap_decode_1(self):
         import codecs
+        assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
+        assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+
         res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
         assert res == (u"ab\ufffd", 3)
         res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
@@ -464,6 +463,9 @@
         assert '\xff'.decode('utf-7', 'ignore') == ''
         assert '\x00'.decode('unicode-internal', 'ignore') == ''
 
+    def test_backslashreplace(self):
+        assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\\u1234\\u20ac\\u8000'
+
     def test_badhandler(self):
         import codecs
         results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
@@ -527,9 +529,26 @@
     def test_charmap_encode(self):
         assert 'xxx'.encode('charmap') == 'xxx'
 
+        import codecs
+        raises(TypeError, codecs.charmap_encode, u'\xff', "replace",  {0xff: 300})
+        raises(UnicodeError, codecs.charmap_encode, u"\xff", "replace", {0xff: None})
+
+    def test_charmap_encode_replace(self):
+        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
+        charmap[ord("?")] = "XYZ"
+        import codecs
+        sin = u"abcDEF"
+        sout = codecs.charmap_encode(sin, "replace", charmap)[0]
+        assert sout == "AABBCCXYZXYZXYZ"
+
     def test_charmap_decode_2(self):
         assert 'foo'.decode('charmap') == 'foo'
 
+    def test_charmap_build(self):
+        import codecs
+        assert codecs.charmap_build(u'123456') == {49: 0, 50: 1, 51: 2,
+                                                   52: 3, 53: 4, 54: 5}
+
     def test_utf7_start_end_in_exception(self):
         try:
             '+IC'.decode('utf-7')
@@ -537,6 +556,9 @@
             assert exc.start == 0
             assert exc.end == 3
 
+    def test_utf7_surrogate(self):
+        raises(UnicodeDecodeError, '+3ADYAA-'.decode, 'utf-7')
+
     def test_utf_16_encode_decode(self):
         import codecs
         x = u'123abc'
@@ -546,6 +568,8 @@
     def test_unicode_escape(self):        
         assert u'\\'.encode('unicode-escape') == '\\\\'
         assert '\\\\'.decode('unicode-escape') == u'\\'
+        assert u'\ud801'.encode('unicode-escape') == '\\ud801'
+        assert u'\u0013'.encode('unicode-escape') == '\\x13'
 
     def test_mbcs(self):
         import sys
@@ -555,14 +579,3 @@
         assert u'caf\xe9'.encode('mbcs') == 'caf\xe9'
         assert u'\u040a'.encode('mbcs') == '?' # some cyrillic letter
         assert 'cafx\e9'.decode('mbcs') == u'cafx\e9'
-
-
-class TestDirect:
-    def test_charmap_encode(self):
-        assert charmap_encode(u'xxx') == ('xxx', 3)
-        assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) ==  ('XXXXXX', 6)
-
-    def test_unicode_escape(self):
-        assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
-        assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
-        assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)

Modified: pypy/trunk/pypy/objspace/std/marshal_impl.py
==============================================================================
--- pypy/trunk/pypy/objspace/std/marshal_impl.py	(original)
+++ pypy/trunk/pypy/objspace/std/marshal_impl.py	Thu Jul  1 22:24:32 2010
@@ -447,11 +447,11 @@
 register(TYPE_CODE, unmarshal_pycode)
 
 def marshal_w__Unicode(space, w_unicode, m):
-    s = space.str_w(unicodehelper.PyUnicode_EncodeUTF8(space, w_unicode))
+    s = unicodehelper.PyUnicode_EncodeUTF8(space, space.unicode_w(w_unicode))
     m.atom_str(TYPE_UNICODE, s)
 
 def unmarshal_Unicode(space, u, tc):
-    return unicodehelper.PyUnicode_DecodeUTF8(space, space.wrap(u.get_str()))
+    return space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, u.get_str()))
 register(TYPE_UNICODE, unmarshal_Unicode)
 
 app = gateway.applevel(r'''

Modified: pypy/trunk/pypy/objspace/std/unicodeobject.py
==============================================================================
--- pypy/trunk/pypy/objspace/std/unicodeobject.py	(original)
+++ pypy/trunk/pypy/objspace/std/unicodeobject.py	Thu Jul  1 22:24:32 2010
@@ -12,6 +12,7 @@
 from pypy.rlib.rarithmetic import intmask, ovfcheck
 from pypy.rlib.objectmodel import compute_hash
 from pypy.rlib.rstring import string_repeat
+from pypy.rlib.runicode import unicode_encode_unicode_escape
 from pypy.module.unicodedata import unicodedb_4_1_0 as unicodedb
 from pypy.tool.sourcetools import func_with_new_name
 
@@ -892,101 +893,11 @@
                     space.wrap("character mapping must return integer, None or unicode"))
     return W_UnicodeObject(u''.join(result))
 
-# Move this into the _codecs module as 'unicodeescape_string (Remember to cater for quotes)'
 def repr__Unicode(space, w_unicode):
-    hexdigits = "0123456789abcdef"
     chars = w_unicode._value
     size = len(chars)
-    
-    singlequote = doublequote = False
-    for c in chars:
-        if c == u'\'':
-            singlequote = True
-        elif c == u'"':
-            doublequote = True
-    if singlequote and not doublequote:
-        quote = '"'
-    else:
-        quote = '\''
-    result = ['u', quote]
-    j = 0
-    while j<len(chars):
-        ch = chars[j]
-        code = ord(ch)
-        if code >= 0x10000:
-            # Resize if needed
-            result.extend(['\\', "U",
-                           hexdigits[(code >> 28) & 0xf],
-                           hexdigits[(code >> 24) & 0xf],
-                           hexdigits[(code >> 20) & 0xf],
-                           hexdigits[(code >> 16) & 0xf],
-                           hexdigits[(code >> 12) & 0xf],
-                           hexdigits[(code >>  8) & 0xf],
-                           hexdigits[(code >>  4) & 0xf],
-                           hexdigits[(code >>  0) & 0xf],
-                           ])
-            j += 1
-            continue
-        if code >= 0xD800 and code < 0xDC00:
-            if j < size - 1:
-                ch2 = chars[j+1]
-                code2 = ord(ch2)
-                if code2 >= 0xDC00 and code2 <= 0xDFFF:
-                    code = (((code & 0x03FF) << 10) | (code2 & 0x03FF)) + 0x00010000
-                    result.extend(['\\', "U",
-                                   hexdigits[(code >> 28) & 0xf],
-                                   hexdigits[(code >> 24) & 0xf],
-                                   hexdigits[(code >> 20) & 0xf],
-                                   hexdigits[(code >> 16) & 0xf],
-                                   hexdigits[(code >> 12) & 0xf],
-                                   hexdigits[(code >>  8) & 0xf],
-                                   hexdigits[(code >>  4) & 0xf],
-                                   hexdigits[(code >>  0) & 0xf],
-                                  ])
-                    j += 2
-                    continue
-                
-        if code >= 0x100:
-            result.extend(['\\', "u",
-                           hexdigits[(code >> 12) & 0xf],
-                           hexdigits[(code >>  8) & 0xf],
-                           hexdigits[(code >>  4) & 0xf],
-                           hexdigits[(code >>  0) & 0xf],
-                          ])
-            j += 1
-            continue
-        if code == ord('\\') or code == ord(quote):
-            result.append('\\')
-            result.append(chr(code))
-            j += 1
-            continue
-        if code == ord('\t'):
-            result.append('\\')
-            result.append('t')
-            j += 1
-            continue
-        if code == ord('\r'):
-            result.append('\\')
-            result.append('r')
-            j += 1
-            continue
-        if code == ord('\n'):
-            result.append('\\')
-            result.append('n')
-            j += 1
-            continue
-        if code < ord(' ') or code >= 0x7f:
-            result.extend(['\\', "x",
-                           hexdigits[(code >> 4) & 0xf], 
-                           hexdigits[(code >> 0) & 0xf],
-                          ])
-            j += 1
-            continue
-        result.append(chr(code))
-        j += 1
-    result.append(quote)
-    return space.wrap(''.join(result))
-        
+    s = unicode_encode_unicode_escape(chars, size, "strict", quotes=True)
+    return space.wrap(s)
 
 def mod__Unicode_ANY(space, w_format, w_values):
     return mod_format(space, w_format, w_values, do_unicode=True)

Modified: pypy/trunk/pypy/rlib/rstring.py
==============================================================================
--- pypy/trunk/pypy/rlib/rstring.py	(original)
+++ pypy/trunk/pypy/rlib/rstring.py	Thu Jul  1 22:24:32 2010
@@ -56,6 +56,7 @@
         self.l.append(s)
 
     def append_slice(self, s, start, end):
+        assert 0 <= start <= end <= len(s)
         self.l.append(s[start:end])
 
     def append_multiple_char(self, c, times):

Modified: pypy/trunk/pypy/rlib/runicode.py
==============================================================================
--- pypy/trunk/pypy/rlib/runicode.py	(original)
+++ pypy/trunk/pypy/rlib/runicode.py	Thu Jul  1 22:24:32 2010
@@ -1,7 +1,9 @@
 import sys
 from pypy.rlib.bitmanipulation import splitter
 from pypy.rpython.lltypesystem import lltype, rffi
-from pypy.rlib.objectmodel import we_are_translated
+from pypy.rlib.objectmodel import we_are_translated, specialize
+from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
+from pypy.rlib.rarithmetic import r_uint
 
 if rffi.sizeof(lltype.UniChar) == 4:
     MAXUNICODE = 0x10ffff
@@ -42,8 +44,6 @@
     UNICHR = unichr
     ORD = ord
 
-# XXX review the functions below and think about using stringbuilders for them
-
 
 def raise_unicode_exception_decode(errors, encoding, msg, s,
                                    startingpos, endingpos):
@@ -55,8 +55,8 @@
     assert isinstance(u, unicode)
     raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
 
-# ____________________________________________________________ 
-# unicode decoding
+# ____________________________________________________________
+# utf-8
 
 utf8_code_length = [
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -81,9 +81,10 @@
                      errorhandler=None):
     if errorhandler is None:
         errorhandler = raise_unicode_exception_decode
-    if (size == 0):
+    if size == 0:
         return u'', 0
-    result = []
+
+    result = UnicodeBuilder(size)
     pos = 0
     while pos < size:
         ch = s[pos]
@@ -94,14 +95,14 @@
             continue
 
         n = utf8_code_length[ordch1]
-        if (pos + n > size):
+        if pos + n > size:
             if not final:
                 break
             else:
                 r, pos = errorhandler(errors, "utf-8",
                                       "unexpected end of data", s,  pos, size)
                 result.append(r)
-                if (pos + n > size):
+                if pos + n > size:
                     break
         if n == 0:
             r, pos = errorhandler(errors, "utf-8", "unexpected code byte",
@@ -116,7 +117,7 @@
             z, two = splitter[6, 2](ordch2)
             y, six = splitter[5, 3](ordch1)
             assert six == 6
-            if (two != 2):
+            if two != 2:
                 r, pos = errorhandler(errors, "utf-8", "invalid data",
                                       s,  pos, pos + 2)
                 result.append(r)
@@ -137,7 +138,7 @@
             y, two2 = splitter[6, 2](ordch2)
             x, fourteen = splitter[4, 4](ordch1)
             assert fourteen == 14
-            if (two1 != 2 or two2 != 2):
+            if two1 != 2 or two2 != 2:
                 r, pos = errorhandler(errors, "utf-8", "invalid data",
                                       s,  pos, pos + 3)
                 result.append(r)
@@ -166,7 +167,7 @@
             x, two3 = splitter[6, 2](ordch2)
             w, thirty = splitter[3, 5](ordch1)
             assert thirty == 30
-            if (two1 != 2 or two2 != 2 or two3 != 2):
+            if two1 != 2 or two2 != 2 or two3 != 2:
                 r, pos = errorhandler(errors, "utf-8", "invalid data",
                                       s,  pos, pos + 4)
                 result.append(r)
@@ -174,7 +175,7 @@
                 c = (w << 18) + (x << 12) + (y << 6) + z
                 # minimum value allowed for 4 byte encoding
                 # maximum value allowed for UTF-16
-                if ((c < 0x10000) or (c > 0x10ffff)):
+                if c < 0x10000 or c > 0x10ffff:
                     r, pos = errorhandler(errors, "utf-8", "illegal encoding",
                                           s,  pos, pos + 4)
                     result.append(r)
@@ -197,8 +198,53 @@
                                   s,  pos, pos + n)
             result.append(r)
 
-    return u"".join(result), pos
+    return result.build(), pos
 
+def _encodeUCS4(result, ch):
+    # Encode UCS4 Unicode ordinals
+    result.append((chr((0xf0 | (ch >> 18)))))
+    result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
+    result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+    result.append((chr((0x80 | (ch & 0x3f)))))
+
+def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+    assert(size >= 0)
+    result = StringBuilder(size)
+    i = 0
+    while i < size:
+        ch = ord(s[i])
+        i += 1
+        if ch < 0x80:
+            # Encode ASCII
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        else:
+            # Encode UCS2 Unicode ordinals
+            if ch < 0x10000:
+                # Special case: check for high surrogate
+                if 0xD800 <= ch <= 0xDBFF and i != size:
+                    ch2 = ord(s[i])
+                    # Check for low surrogate and combine the two to
+                    # form a UCS4 value
+                    if 0xDC00 <= ch2 <= 0xDFFF:
+                        ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                        i += 1
+                        _encodeUCS4(result, ch3)
+                        continue
+                # Fall through: handles isolated high surrogates
+                result.append((chr((0xe0 | (ch >> 12)))))
+                result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+                result.append((chr((0x80 | (ch & 0x3f)))))
+                continue
+            else:
+                _encodeUCS4(result, ch)
+    return result.build()
+
+# ____________________________________________________________
+# utf-16
 
 def str_decode_utf_16(s, size, errors, final=True,
                       errorhandler=None):
@@ -238,12 +284,11 @@
     #  mark is skipped, in all other modes, it is copied to the output
     #  stream as-is (giving a ZWNBSP character).
     pos = 0
-    result = []
     if byteorder == 'native':
-        if (size >= 2):
+        if size >= 2:
             bom = (ord(s[ihi]) << 8) | ord(s[ilo])
             if BYTEORDER == 'little':
-                if (bom == 0xFEFF):
+                if bom == 0xFEFF:
                     pos += 2
                     bo = -1
                 elif bom == 0xFFFE:
@@ -260,20 +305,22 @@
         bo = -1
     else:
         bo = 1
-    if (size == 0):
+    if size == 0:
         return u'', 0, bo
-    if (bo == -1):
+    if bo == -1:
         # force little endian
         ihi = 1
         ilo = 0
 
-    elif (bo == 1):
+    elif bo == 1:
         # force big endian
         ihi = 0
         ilo = 1
 
+    result = UnicodeBuilder(size // 2)
+
     #XXX I think the errors are not correctly handled here
-    while (pos < len(s)):
+    while pos < size:
         # remaining bytes at the end? (size should be even)
         if len(s) - pos < 2:
             if not final:
@@ -285,7 +332,7 @@
                 break
         ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
         pos += 2
-        if (ch < 0xD800 or ch > 0xDFFF):
+        if ch < 0xD800 or ch > 0xDFFF:
             result.append(unichr(ch))
             continue
         # UTF-16 code pair:
@@ -297,10 +344,10 @@
             result.append(r)
             if len(s) - pos < 2:
                 break
-        elif (0xD800 <= ch and ch <= 0xDBFF):
+        elif 0xD800 <= ch <= 0xDBFF:
             ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
             pos += 2
-            if (0xDC00 <= ch2 and ch2 <= 0xDFFF):
+            if 0xDC00 <= ch2 <= 0xDFFF:
                 if MAXUNICODE < 65536:
                     result.append(unichr(ch))
                     result.append(unichr(ch2))
@@ -318,17 +365,305 @@
                                   "illegal encoding",
                                   s, pos - 2, pos)
             result.append(r)
-    return u"".join(result), pos, bo
+    return result.build(), pos, bo
+
+def _STORECHAR(result, CH, byteorder):
+    hi = chr(((CH) >> 8) & 0xff)
+    lo = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(lo)
+        result.append(hi)
+    else:
+        result.append(hi)
+        result.append(lo)
+
+def unicode_encode_utf_16_helper(s, size, errors,
+                                 errorhandler=None,
+                                 byteorder='little'):
+    if size == 0:
+        return ""
+
+    result = StringBuilder(size * 2 + 2)
+    if byteorder == 'native':
+        _STORECHAR(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    i = 0
+    while i < size:
+        ch = ord(s[i])
+        i += 1
+        ch2 = 0
+        if ch >= 0x10000:
+            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
+            ch  = 0xD800 | ((ch-0x10000) >> 10)
+
+        _STORECHAR(result, ch, byteorder)
+        if ch2:
+            _STORECHAR(result, ch2, byteorder)
+
+    return result.build()
+
+def unicode_encode_utf_16(s, size, errors,
+                          errorhandler=None):
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
+
+
+def unicode_encode_utf_16_be(s, size, errors,
+                             errorhandler=None):
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
+
+
+def unicode_encode_utf_16_le(s, size, errors,
+                             errorhandler=None):
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
+
+
+# ____________________________________________________________
+# utf-7
+
+## indicate whether a UTF-7 character is special i.e. cannot be directly
+##       encoded:
+##         0 - not special
+##         1 - special
+##         2 - whitespace (optional)
+##         3 - RFC2152 Set O (optional)
+
+_utf7_special = [
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
+    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
+    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+]
+
+def _utf7_SPECIAL(oc, encodeO=False, encodeWS=False):
+    return (oc > 127 or _utf7_special[oc] == 1 or
+            (encodeWS and _utf7_special[oc] == 2) or
+            (encodeO and _utf7_special[oc] == 3))
+
+def _utf7_B64CHAR(oc):
+    if oc > 127:
+        return False
+    c = chr(oc)
+    return c.isalnum() or c == '+' or c == '/'
+def _utf7_TO_BASE64(n):
+    "Returns the base-64 character of the bottom 6 bits of n"
+    return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
+def _utf7_FROM_BASE64(c):
+    "Retuns the base-64 value of a base-64 character"
+    if c == '+':
+        return 62
+    elif c == '/':
+        return 63
+    elif c >= 'a':
+        return ord(c) - 71
+    elif c >= 'A':
+        return ord(c) - 65
+    else:
+        return ord(c) + 4
+
+def _utf7_ENCODE(result, ch, bits):
+    while bits >= 6:
+        result.append(_utf7_TO_BASE64(ch >> (bits - 6)))
+        bits -= 6
+    return bits
+
+def _utf7_DECODE(s, result, errorhandler, errors,
+                 pos, charsleft, bitsleft, surrogate):
+    while bitsleft >= 16:
+        outCh =  (charsleft >> (bitsleft-16)) & 0xffff
+        bitsleft -= 16
+
+        if surrogate:
+            ## We have already generated an error for the high
+            ## surrogate so let's not bother seeing if the low
+            ## surrogate is correct or not
+            surrogate = False
+        elif 0xDC00 <= outCh <= 0xDFFF:
+            ## This is a surrogate pair. Unfortunately we can't
+            ## represent it in a 16-bit character
+            surrogate = True
+            msg = "code pairs are not supported"
+            res, pos = errorhandler(errors, 'utf-7',
+                                    msg, s, pos-1, pos)
+            result.append(res)
+            bitsleft = 0
+            break
+        else:
+            result.append(unichr(outCh))
+    return pos, charsleft, bitsleft, surrogate
+
+
+def str_decode_utf_7(s, size, errors, final=False,
+                     errorhandler=None):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
+    if size == 0:
+        return u'', 0
+
+    inShift = False
+    bitsleft = 0
+    startinpos = 0
+    charsleft = 0
+    surrogate = False
+
+    result = UnicodeBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+        oc = ord(ch)
+
+        if inShift:
+            if ch == '-' or not _utf7_B64CHAR(oc):
+                inShift = 0
+                pos += 1
+
+                pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
+                    s, result, errorhandler, errors,
+                    pos, charsleft, bitsleft, surrogate)
+                if bitsleft >= 6:
+                    ## The shift sequence has a partial character in it. If
+                    ## bitsleft < 6 then we could just classify it as padding
+                    ## but that is not the case here
+                    msg = "partial character in shift sequence"
+                    res, pos = errorhandler(errors, 'utf-7',
+                                            msg, s, pos-1, pos)
+                    result.append(res)
+                    ## According to RFC2152 the remaining bits should be
+                    ## zero. We choose to signal an error/insert a replacement
+                    ## character here to indicate the potential of a
+                    ## misencoded character.
+                if ch == '-':
+                    if pos < size and s[pos] == '-':
+                        result.append(u'-')
+                        inShift = True
+
+                elif _utf7_SPECIAL(oc):
+                    msg = "unexpected special character"
+                    res, pos = errorhandler(errors, 'utf-7',
+                                            msg, s, pos-1, pos)
+                    result.append(res)
+                else:
+                    result.append(unichr(ord(ch)))
+            else:
+                charsleft = (charsleft << 6) | _utf7_FROM_BASE64(ch)
+                bitsleft += 6
+                pos += 1
+
+                pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
+                    s, result, errorhandler, errors,
+                    pos, charsleft, bitsleft, surrogate)
+        elif ch == '+':
+            startinpos = pos
+            pos += 1
+            if pos < size and s[pos] == '-':
+                pos += 1
+                result.append(u'+')
+            else:
+                inShift = 1
+                bitsleft = 0
+
+        elif _utf7_SPECIAL(oc):
+            pos += 1
+            msg = "unexpected special character"
+            res, pos = errorhandler(errors, 'utf-7', msg, s, pos-1, pos)
+            result.append(res)
+        else:
+            result.append(unichr(oc))
+            pos += 1
+
+    if inShift:
+        endinpos = size
+        msg = "unterminated shift sequence"
+        res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
+        result.append(res)
+
+    return result.build(), pos
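
(For reference, not from the commit: the decoder should accept the classic
RFC 2152 forms; nothing below hits the error path, so the default error
handler is never invoked.)

    u, pos = str_decode_utf_7("+ACE-", 5, 'strict')
    assert u == u'!' and pos == 5
    # '+-' is the escaped spelling of a literal '+'
    assert str_decode_utf_7("+-", 2, 'strict')[0] == u'+'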
+
+def unicode_encode_utf_7(s, size, errors, errorhandler=None):
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+
+    encodeSetO = encodeWhiteSpace = False
+
+    inShift = False
+    bitsleft = 0
+    charsleft = 0
+
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+        oc = ord(ch)
+        if not inShift:
+            if ch == u'+':
+                result.append('+-')
+            elif _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+                charsleft = oc
+                bitsleft = 16
+                result.append('+')
+                bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
+                inShift = bitsleft > 0
+            else:
+                result.append(chr(oc))
+        else:
+            if not _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+                result.append(_utf7_TO_BASE64(charsleft << (6-bitsleft)))
+                charsleft = 0
+                bitsleft = 0
+                ## Characters not in the BASE64 set implicitly unshift the
+                ## sequence so no '-' is required, except if the character is
+                ## itself a '-'
+                if _utf7_B64CHAR(oc) or ch == u'-':
+                    result.append('-')
+                inShift = False
+                result.append(chr(oc))
+            else:
+                bitsleft += 16
+                charsleft = (charsleft << 16) | oc
+                bitsleft =  _utf7_ENCODE(result, charsleft, bitsleft)
+                ## If the next character is special then we don't need to
+                ## terminate the shift sequence. If the next character is not
+                ## a BASE64 character or '-' then the shift sequence will be
+                ## terminated implicitly and we don't have to insert a '-'.
+                if bitsleft == 0:
+                    if pos + 1 < size:
+                        ch2 = s[pos + 1]
+                        oc2 = ord(ch2)
+
+                        if _utf7_SPECIAL(oc2, encodeSetO, encodeWhiteSpace):
+                            pass
+                        elif _utf7_B64CHAR(oc2) or ch2 == u'-':
+                            result.append('-')
+                            inShift = False
+                        else:
+                            inShift = False
+                    else:
+                        result.append('-')
+                        inShift = False
+        pos += 1
+
+    if bitsleft:
+        result.append(_utf7_TO_BASE64(charsleft << (6 - bitsleft)))
+        result.append('-')
+
+    return result.build()
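
(A rough sanity check of the encoder, again not part of the diff: ASCII
characters outside the special sets pass through untouched, and a non-ASCII
character opens and closes a shift sequence, matching CPython's utf-7 codec.)

    assert unicode_encode_utf_7(u"Hello", 5, 'strict') == "Hello"
    assert unicode_encode_utf_7(u"\xe9", 1, 'strict') == "+AOk-"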
+
+# ____________________________________________________________
+# ascii and latin-1
 
 def str_decode_latin_1(s, size, errors, final=False,
                        errorhandler=None):
     # latin1 is equivalent to the first 256 ordinals in Unicode.
     pos = 0
-    result = []
-    while (pos < size):
+    result = UnicodeBuilder(size)
+    while pos < size:
         result.append(unichr(ord(s[pos])))
         pos += 1
-    return u"".join(result), pos
+    return result.build(), pos
 
 
 def str_decode_ascii(s, size, errors, final=False,
@@ -336,9 +671,9 @@
     if errorhandler is None:
         errorhandler = raise_unicode_exception_decode
     # ASCII is equivalent to the first 128 ordinals in Unicode.
-    result = []
+    result = UnicodeBuilder(size)
     pos = 0
-    while pos < len(s):
+    while pos < size:
         c = s[pos]
         if ord(c) < 128:
             result.append(unichr(ord(c)))
@@ -347,55 +682,7 @@
             r, pos = errorhandler(errors, "ascii", "ordinal not in range(128)",
                                   s,  pos, pos + 1)
             result.append(r)
-    return u"".join(result), pos
-
-
-# ____________________________________________________________ 
-# unicode encoding 
-
-
-def unicode_encode_utf_8(s, size, errors, errorhandler=None):
-    assert(size >= 0)
-    result = []
-    i = 0
-    while i < size:
-        ch = ord(s[i])
-        i += 1
-        if (ch < 0x80):
-            # Encode ASCII 
-            result.append(chr(ch))
-        elif (ch < 0x0800) :
-            # Encode Latin-1 
-            result.append(chr((0xc0 | (ch >> 6))))
-            result.append(chr((0x80 | (ch & 0x3f))))
-        else:
-            # Encode UCS2 Unicode ordinals
-            if (ch < 0x10000):
-                # Special case: check for high surrogate
-                if (0xD800 <= ch and ch <= 0xDBFF and i != size) :
-                    ch2 = ord(s[i])
-                    # Check for low surrogate and combine the two to
-                    # form a UCS4 value
-                    if (0xDC00 <= ch2 and ch2 <= 0xDFFF) :
-                        ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
-                        i += 1
-                        _encodeUCS4(result, ch3)
-                        continue
-                # Fall through: handles isolated high surrogates
-                result.append((chr((0xe0 | (ch >> 12)))))
-                result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
-                result.append((chr((0x80 | (ch & 0x3f)))))
-                continue
-            else:
-                _encodeUCS4(result, ch)
-    return "".join(result)
-
-def _encodeUCS4(result, ch):
-    # Encode UCS4 Unicode ordinals
-    result.append((chr((0xf0 | (ch >> 18)))))
-    result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
-    result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
-    result.append((chr((0x80 | (ch & 0x3f)))))
+    return result.build(), pos
 
 
 def unicode_encode_ucs1_helper(p, size, errors,
@@ -408,12 +695,12 @@
     else:
         reason = "ordinal not in range(128)"
         encoding = "ascii"
-    
-    if (size == 0):
+
+    if size == 0:
         return ''
-    result = []
+    result = StringBuilder(size)
     pos = 0
-    while pos < len(p):
+    while pos < size:
         ch = p[pos]
         
         if ord(ch) < limit:
@@ -427,9 +714,9 @@
                 collend += 1
             r, pos = errorhandler(errors, encoding, reason, p,
                                   collstart, collend)
-            result += r   # extend 'result' as a list of characters
+            result.append(r)
     
-    return "".join(result)
+    return result.build()
 
 def unicode_encode_latin_1(p, size, errors, errorhandler=None):
     res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 256)
@@ -439,57 +726,479 @@
     res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 128)
     return res
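
(Each decode function takes an errorhandler(errors, encoding, reason, input,
startpos, endpos) callback returning (replacement, newpos); the real handlers
live at interp level.  A minimal 'replace'-style stand-in, purely for
illustration:)

    def replace_errors(errors, encoding, msg, s, startpos, endpos):
        # hypothetical handler: substitute U+FFFD and resume after the error
        return u'\ufffd', endpos

    u, pos = str_decode_ascii('a\xffb', 3, 'replace',
                              errorhandler=replace_errors)
    assert u == u'a\ufffdb' and pos == 3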
 
+# ____________________________________________________________
+# Charmap
 
-def _STORECHAR(result, CH, byteorder):
-    hi = chr(((CH) >> 8) & 0xff)
-    lo = chr((CH) & 0xff)
-    if byteorder == 'little':
-        result.append(lo)
-        result.append(hi)
+ERROR_CHAR = u'\ufffe'
+
+@specialize.argtype(5)
+def str_decode_charmap(s, size, errors, final=False,
+                       errorhandler=None, mapping=None):
+    "mapping can be a rpython dictionary, or a dict-like object."
+
+    # Default to Latin-1
+    if mapping is None:
+        return str_decode_latin_1(s, size, errors, final=final,
+                                  errorhandler=errorhandler)
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
+    if size == 0:
+        return u'', 0
+
+    pos = 0
+    result = UnicodeBuilder(size)
+    while pos < size:
+        ch = s[pos]
+
+        c = mapping.get(ch, ERROR_CHAR)
+        if c == ERROR_CHAR:
+            r, pos = errorhandler(errors, "charmap",
+                                  "character maps to <undefined>",
+                                  s,  pos, pos + 1)
+            result.append(r)
+            continue
+        result.append(c)
+        pos += 1
+    return result.build(), pos
+
+def unicode_encode_charmap(s, size, errors, errorhandler=None,
+                           mapping=None):
+    if mapping is None:
+        return unicode_encode_latin_1(s, size, errors,
+                                      errorhandler=errorhandler)
+
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_encode
+
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        c = mapping.get(ch, '')
+        if len(c) == 0:
+            res, pos = errorhandler(errors, "charmap",
+                                    "character maps to <undefined>",
+                                    s, pos, pos + 1)
+            for ch2 in res:
+                c2 = mapping.get(unichr(ord(ch2)), '')
+                if len(c2) == 0:
+                    errorhandler(
+                        "strict", "charmap",
+                        "character maps to <undefined>",
+                        s,  pos, pos + 1)
+                result.append(c2)
+            continue
+        result.append(c)
+        pos += 1
+    return result.build()
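
(Usage sketch, not in the diff: with a plain dict the charmap codec maps
byte -> unicode on decode and unicode -> byte string on encode; at interp
level the mapping is a wrapped dict-like object instead.)

    decode_map = {'a': u'\u03b1', 'b': u'\u03b2'}
    u, pos = str_decode_charmap('ab', 2, 'strict', mapping=decode_map)
    assert u == u'\u03b1\u03b2' and pos == 2

    encode_map = {u'\u03b1': 'a'}
    assert unicode_encode_charmap(u'\u03b1', 1, 'strict',
                                  mapping=encode_map) == 'a'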
+
+# ____________________________________________________________
+# Unicode escape
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+def hexescape(builder, s, pos, digits,
+              encoding, errorhandler, message, errors):
+    import sys
+    chr = 0
+    if pos + digits > len(s):
+        message = "end of string in escape sequence"
+        res, pos = errorhandler(errors, "unicodeescape",
+                                message, s, pos-2, len(s))
+        builder.append(res)
     else:
-        result.append(hi)
-        result.append(lo)
+        try:
+            chr = r_uint(int(s[pos:pos+digits], 16))
+        except ValueError:
+            endinpos = pos
+            while s[endinpos] in hexdigits:
+                endinpos += 1
+            res, pos = errorhandler(errors, encoding,
+                                    message, s, pos-2, endinpos+1)
+            builder.append(res)
+        else:
+            # when we get here, chr is a 32-bit unicode character
+            if chr <= MAXUNICODE:
+                builder.append(unichr(chr))
+                pos += digits
+
+            elif chr <= 0x10ffff:
+                chr -= 0x10000L
+                builder.append(unichr(0xD800 + (chr >> 10)))
+                builder.append(unichr(0xDC00 +  (chr & 0x03FF)))
+                pos += digits
+            else:
+                message = "illegal Unicode character"
+                res, pos = errorhandler(errors, encoding,
+                                        message, s, pos-2, pos+digits)
+                builder.append(res)
+    return pos
+
+def str_decode_unicode_escape(s, size, errors, final=False,
+                              errorhandler=None,
+                              unicodedata_handler=None):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
 
-def unicode_encode_utf_16_helper(s, size, errors,
-                                 errorhandler=None,
-                                 byteorder='little'):
-    result = []
-    if (byteorder == 'native'):
-        _STORECHAR(result, 0xFEFF, BYTEORDER)
-        byteorder = BYTEORDER
-        
     if size == 0:
-        return ""
+        return u'', 0
 
-    i = 0
-    while i < size:
-        ch = ord(s[i])
-        i += 1
-        ch2 = 0
-        if (ch >= 0x10000) :
-            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
-            ch  = 0xD800 | ((ch-0x10000) >> 10)
+    builder = UnicodeBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
 
-        _STORECHAR(result, ch, byteorder)
-        if ch2:
-            _STORECHAR(result, ch2, byteorder)
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            builder.append(unichr(ord(ch)))
+            pos += 1
+            continue
 
-    return "".join(result)
+        # - Escapes
+        pos += 1
+        if pos >= size:
+            message = "\\ at end of string"
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos-1, size)
+            builder.append(res)
+            continue
 
-def unicode_encode_utf_16(s, size, errors,
-                          errorhandler=None):
-    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
+        ch = s[pos]
+        pos += 1
+        # \x escapes
+        if ch == '\n': pass
+        elif ch == '\\': builder.append(u'\\')
+        elif ch == '\'': builder.append(u'\'')
+        elif ch == '\"': builder.append(u'\"')
+        elif ch == 'b' : builder.append(u'\b')
+        elif ch == 'f' : builder.append(u'\f')
+        elif ch == 't' : builder.append(u'\t')
+        elif ch == 'n' : builder.append(u'\n')
+        elif ch == 'r' : builder.append(u'\r')
+        elif ch == 'v' : builder.append(u'\v')
+        elif ch == 'a' : builder.append(u'\a')
+        elif '0' <= ch <= '7':
+            x = ord(ch) - ord('0')
+            if pos < size:
+                ch = s[pos]
+                if '0' <= ch <= '7':
+                    pos += 1
+                    x = (x<<3) + ord(ch) - ord('0')
+                    if pos < size:
+                        ch = s[pos]
+                        if '0' <= ch <= '7':
+                            pos += 1
+                            x = (x<<3) + ord(ch) - ord('0')
+            builder.append(unichr(x))
+        # hex escapes
+        # \xXX
+        elif ch == 'x':
+            digits = 2
+            message = "truncated \\xXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \uXXXX
+        elif ch == 'u':
+            digits = 4
+            message = "truncated \\uXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        #  \UXXXXXXXX
+        elif ch == 'U':
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \N{name}
+        elif ch == 'N':
+            message = "malformed \\N character escape"
+            look = pos
+            if unicodedata_handler is None:
+                message = ("\\N escapes not supported "
+                           "(can't load unicodedata module)")
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, size)
+                builder.append(res)
+                continue
+
+            if look < size and s[look] == '{':
+                # look for the closing brace
+                while look < size and s[look] != '}':
+                    look += 1
+                if look < size and s[look] == '}':
+                    # found a name.  look it up in the unicode database
+                    message = "unknown Unicode character name"
+                    name = s[pos+1:look]
+                    code = unicodedata_handler.call(name)
+                    if code < 0:
+                        res, pos = errorhandler(errors, "unicodeescape",
+                                                message, s, pos-1, look+1)
+                        builder.append(res)
+                        continue
+                    pos = look + 1
+                    if code <= MAXUNICODE:
+                        builder.append(unichr(code))
+                    else:
+                        code -= 0x10000L
+                        builder.append(unichr(0xD800 + (code >> 10)))
+                        builder.append(unichr(0xDC00 + (code & 0x03FF)))
+                else:
+                    res, pos = errorhandler(errors, "unicodeescape",
+                                            message, s, pos-1, look+1)
+                    builder.append(res)
+            else:
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, look+1)
+                builder.append(res)
+        else:
+            builder.append(u'\\')
+            builder.append(unichr(ord(ch)))
 
+    return builder.build(), pos
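
(Illustrative call, not part of the commit; the handler is passed explicitly
but never invoked because the input is well formed.)

    s = r"\u00e9\n\x41"
    u, pos = str_decode_unicode_escape(s, len(s), 'strict',
                                       errorhandler=raise_unicode_exception_decode)
    assert u == u'\xe9\nA' and pos == len(s)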
 
-def unicode_encode_utf_16_be(s, size, errors,
-                             errorhandler=None):
-    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
+def unicode_encode_unicode_escape(s, size, errors, errorhandler=None, quotes=False):
+    # errorhandler is not used: this function cannot cause Unicode errors
+    result = StringBuilder(size)
+
+    if quotes:
+        if s.find(u'\'') != -1 and s.find(u'\"') == -1:
+            quote = ord('\"')
+            result.append('u"')
+        else:
+            quote = ord('\'')
+            result.append('u\'')
+    else:
+        quote = 0
 
+        if size == 0:
+            return ''
 
-def unicode_encode_utf_16_le(s, size, errors,
-                             errorhandler=None):
-    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+        oc = ord(ch)
+
+        # Escape quotes
+        if quotes and (oc == quote or ch == '\\'):
+            result.append('\\')
+            result.append(chr(oc))
+            pos += 1
+            continue
+
+        if 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+            # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+            pos += 1
+            oc2 = ord(s[pos])
+
+            if 0xDC00 <= oc2 <= 0xDFFF:
+                ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+                raw_unicode_escape_helper(result, ucs)
+                pos += 1
+                continue
+            # Fall through: isolated surrogates are copied as-is
+            pos -= 1
+
+        # Map special whitespace to '\t', '\n', '\r'
+        if ch == '\t':
+            result.append('\\t')
+        elif ch == '\n':
+            result.append('\\n')
+        elif ch == '\r':
+            result.append('\\r')
+        elif ch == '\\':
+            result.append('\\\\')
+
+        # Map non-printable or non-ascii to '\xhh' or '\uhhhh'
+        elif oc < 32 or oc >= 0x7F:
+            raw_unicode_escape_helper(result, oc)
+
+        # Copy everything else as-is
+        else:
+            result.append(chr(oc))
+        pos += 1
+
+    if quotes:
+        result.append(chr(quote))
+    return result.build()
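
(Sketch only: non-printable and non-ASCII characters come out as backslash
escapes, and quotes=True additionally wraps the result in a repr-style
u'...' literal.)

    assert unicode_encode_unicode_escape(u'\xe9\n', 2, 'strict') == '\\xe9\\n'
    assert unicode_encode_unicode_escape(u'\xe9', 1, 'strict',
                                         quotes=True) == "u'\\xe9'"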
+
+# ____________________________________________________________
+# Raw unicode escape
+
+def str_decode_raw_unicode_escape(s, size, errors, final=False,
+                                  errorhandler=None):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
+    if size == 0:
+        return u'', 0
+
+    result = UnicodeBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            result.append(unichr(ord(ch)))
+            pos += 1
+            continue
+
+        startinpos = pos
+        # \u-escapes are only interpreted iff the number of leading
+        # backslashes is odd
+        bs = pos
+        while pos < size:
+            pos += 1
+            if pos == size or s[pos] != '\\':
+                break
+            result.append(u'\\')
+
+        # we have a backslash at the end of the string, stop here
+        if pos >= size:
+            result.append(u'\\')
+            break
+
+        if ((pos - bs) & 1 == 0 or
+            pos >= size or
+            (s[pos] != 'u' and s[pos] != 'U')):
+            result.append(u'\\')
+            result.append(unichr(ord(s[pos])))
+            pos += 1
+            continue
+
+        if s[pos] == 'u':
+            digits = 4
+            message = "truncated \\uXXXX escape"
+        else:
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+        pos += 1
+        pos = hexescape(result, s, pos, digits,
+                        "rawunicodeescape", errorhandler, message, errors)
+
+    return result.build(), pos
+
+def raw_unicode_escape_helper(result, char):
+    num = hex(char)
+    if char >= 0x10000:
+        result.append("\\U")
+        zeros = 8
+    elif char >= 0x100:
+        result.append("\\u")
+        zeros = 4
+    else:
+        result.append("\\x")
+        zeros = 2
+    lnum = len(num)
+    nb = zeros + 2 - lnum # num starts with '0x'
+    if nb > 0:
+        result.append_multiple_char('0', nb)
+    result.append_slice(num, 2, lnum)
+
+def unicode_encode_raw_unicode_escape(s, size, errors, errorhandler=None):
+    # errorhandler is not used: this function cannot cause Unicode errors
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        oc = ord(s[pos])
+        if oc < 0x100:
+            result.append(chr(oc))
+        else:
+            raw_unicode_escape_helper(result, oc)
+        pos += 1
+
+    return result.build()
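
(As a sketch, not from the diff: raw-unicode-escape only escapes ordinals
>= 0x100, and the decoder reverses it.)

    raw = unicode_encode_raw_unicode_escape(u'\xe9\u20ac', 2, 'strict')
    assert raw == '\xe9\\u20ac'
    u, pos = str_decode_raw_unicode_escape(raw, len(raw), 'strict')
    assert u == u'\xe9\u20ac' and pos == len(raw)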
+
+# ____________________________________________________________
+# unicode-internal
+
+def str_decode_unicode_internal(s, size, errors, final=False,
+                                errorhandler=None):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
+    if size == 0:
+        return u'', 0
+
+    if MAXUNICODE < 65536:
+        unicode_bytes = 2
+    else:
+        unicode_bytes = 4
+    if BYTEORDER == "little":
+        start = 0
+        stop = unicode_bytes
+        step = 1
+    else:
+        start = unicode_bytes - 1
+        stop = -1
+        step = -1
+
+    result = UnicodeBuilder(size // unicode_bytes)
+    pos = 0
+    while pos < size:
+        if pos > size - unicode_bytes:
+            res, pos = errorhandler(errors, "unicode_internal",
+                                    "truncated input",
+                                    s, pos, size)
+            result.append(res)
+            if pos > size - unicode_bytes:
+                break
+            continue
+        t = r_uint(0)
+        h = 0
+        for j in range(start, stop, step):
+            t += r_uint(ord(s[pos + j])) << (h*8)
+            h += 1
+        if t > MAXUNICODE:
+            res, pos = errorhandler(errors, "unicode_internal",
+                                    "unichr(%d) not in range" % (t,),
+                                    s, pos, pos + unicode_bytes)
+            result.append(res)
+            continue
+        result.append(unichr(t))
+        pos += unicode_bytes
+    return result.build(), pos
+
+def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
+    if size == 0:
+        return ''
+
+    if MAXUNICODE < 65536:
+        unicode_bytes = 2
+    else:
+        unicode_bytes = 4
+
+    result = StringBuilder(size * unicode_bytes)
+    pos = 0
+    while pos < size:
+        oc = ord(s[pos])
+        if MAXUNICODE < 65536:
+            if BYTEORDER == "little":
+                result.append(chr(oc       & 0xFF))
+                result.append(chr(oc >>  8 & 0xFF))
+            else:
+                result.append(chr(oc >>  8 & 0xFF))
+                result.append(chr(oc       & 0xFF))
+        else:
+            if BYTEORDER == "little":
+                result.append(chr(oc       & 0xFF))
+                result.append(chr(oc >>  8 & 0xFF))
+                result.append(chr(oc >> 16 & 0xFF))
+                result.append(chr(oc >> 24 & 0xFF))
+            else:
+                result.append(chr(oc >> 24 & 0xFF))
+                result.append(chr(oc >> 16 & 0xFF))
+                result.append(chr(oc >>  8 & 0xFF))
+                result.append(chr(oc       & 0xFF))
+        pos += 1
 
+    return result.build()
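
(Illustrative round trip, not part of the diff; the byte layout depends on
the build, so this assumes a narrow 2-byte unicode build on a little-endian
machine.)

    data = unicode_encode_unicode_internal(u'\u20ac', 1, 'strict')
    assert data == '\xac\x20'
    u, pos = str_decode_unicode_internal(data, 2, 'strict')
    assert u == u'\u20ac' and pos == 2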
 
 # ____________________________________________________________
 # MBCS codecs for Windows


