[pypy-commit] pypy utf8-unicode2: Steal functionality from runicode and start fixing _codec functions

waedt noreply at buildbot.pypy.org
Mon Jun 23 16:19:39 CEST 2014


Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72162:3f2065730015
Date: 2014-06-23 09:18 -0500
http://bitbucket.org/pypy/pypy/changeset/3f2065730015/

Log:	Steal functionality from runicode and start fixing _codec functions

diff too long, truncating to 2000 out of 3250 lines

diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -61,5 +61,3 @@
     assert s[0:1] == u'A'
     assert s[0:2] == u'A\u010F'
     assert s[1:2] == u'\u010F'
-    assert s[-4:-3] == u'A'
-    assert s[-4:-2] == u'A\u010F'
diff --git a/pypy/interpreter/test/test_utf8_codecs.py b/pypy/interpreter/test/test_utf8_codecs.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/test/test_utf8_codecs.py
@@ -0,0 +1,792 @@
+# -*- coding: utf-8 -*-
+
+import py
+import sys, random
+
+from pypy.interpreter.utf8 import Utf8Str
+from pypy.interpreter import utf8_codecs
+
+'''
+try:
+    import signal
+except ImportError:
+    pass
+else:
+    class MyKeyboardInterrupt(BaseException):
+        pass
+    def _interrupt(*args):
+        __tracebackhide__ = True
+        raise MyKeyboardInterrupt
+    signal.signal(signal.SIGINT, _interrupt)
+'''
+
+class UnicodeTests(object):
+    def typeequals(self, x, y):
+        assert x == y
+        assert type(x) is type(y)
+
+    def getdecoder(self, encoding):
+        return getattr(utf8_codecs, "str_decode_%s" % encoding.replace("-", "_"))
+
+    def getencoder(self, encoding):
+        return getattr(utf8_codecs,
+                       "unicode_encode_%s" % encoding.replace("-", "_"))
+
+    def checkdecode(self, s, encoding):
+        decoder = self.getdecoder(encoding)
+        try:
+            if isinstance(s, str):
+                trueresult = s.decode(encoding)
+            else:
+                trueresult = s
+                s = s.encode(encoding)
+        except LookupError, e:
+            py.test.skip(e)
+        trueresult = Utf8Str.from_unicode(trueresult)
+        result, consumed = decoder(s, len(s), True)
+        assert consumed == len(s)
+        self.typeequals(trueresult, result)
+
+    def checkencode(self, s, encoding):
+        encoder = self.getencoder(encoding)
+        try:
+            if isinstance(s, unicode):
+                trueresult = s.encode(encoding)
+            else:
+                trueresult = s
+                s = s.decode(encoding)
+        except LookupError, e:
+            py.test.skip(e)
+        s = Utf8Str.from_unicode(s)
+        result = encoder(s, len(s), True)
+        self.typeequals(trueresult, result)
+
+    def checkencodeerror(self, s, encoding, start, stop):
+        called = [False]
+        def errorhandler(errors, enc, msg, t, startingpos,
+                         endingpos):
+            called[0] = True
+            assert errors == "foo!"
+            assert enc == encoding
+            assert t is s
+            assert start == startingpos
+            assert stop == endingpos
+            return "42424242", None, stop
+        encoder = self.getencoder(encoding)
+        result = encoder(s, len(s), "foo!", errorhandler)
+        assert called[0]
+        assert "42424242" in result
+
+        # ensure bytes results passthru
+        def errorhandler_bytes(errors, enc, msg, t, startingpos,
+                               endingpos):
+            return None, '\xc3', endingpos
+        result = encoder(s, len(s), "foo!", errorhandler_bytes)
+        assert '\xc3' in result
+
+    def checkdecodeerror(self, s, encoding, start, stop,
+                         addstuff=True, msg=None):
+        called = [0]
+        def errorhandler(errors, enc, errmsg, t, startingpos,
+                         endingpos):
+            called[0] += 1
+            if called[0] == 1:
+                assert errors == "foo!"
+                assert enc == encoding.replace('-', '')
+                assert t is s
+                assert start == startingpos
+                assert stop == endingpos
+                if msg is not None:
+                    assert errmsg == msg
+                return "42424242", stop
+            return "", endingpos
+        decoder = self.getdecoder(encoding)
+        if addstuff:
+            s += "some rest in ascii"
+        result, _ = decoder(s, len(s), "foo!", True, errorhandler)
+        assert called[0] > 0
+        assert "42424242" in result
+        if addstuff:
+            assert result.endswith("some rest in ascii")
+
+
+class TestDecoding(UnicodeTests):
+    # XXX test bom recognition in utf-16
+    # XXX test proper error handling
+
+    def test_all_ascii(self):
+        for i in range(128):
+            for encoding in "utf-8 latin-1 ascii".split():
+                self.checkdecode(chr(i), encoding)
+
+    def test_all_first_256(self):
+        for i in range(256):
+            for encoding in ("utf-7 utf-8 latin-1 utf-16 utf-16-be utf-16-le "
+                             "utf-32 utf-32-be utf-32-le").split():
+                self.checkdecode(unichr(i), encoding)
+
+    def test_first_10000(self):
+        for i in range(10000):
+            for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
+                             "utf-32 utf-32-be utf-32-le").split():
+                if encoding == 'utf-8' and 0xd800 <= i <= 0xdfff:
+                    # Don't try to encode lone surrogates
+                    continue
+                self.checkdecode(unichr(i), encoding)
+
+    def test_random(self):
+        for i in range(10000):
+            v = random.randrange(sys.maxunicode)
+            if 0xd800 <= v <= 0xdfff:
+                continue
+            uni = unichr(v)
+            if sys.version >= "2.7":
+                self.checkdecode(uni, "utf-7")
+            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+                             "utf-32 utf-32-be utf-32-le").split():
+                self.checkdecode(uni, encoding)
+
+    def test_maxunicode(self):
+        uni = unichr(sys.maxunicode)
+        if sys.version >= "2.7":
+            self.checkdecode(uni, "utf-7")
+        for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+                         "utf-32 utf-32-be utf-32-le").split():
+            self.checkdecode(uni, encoding)
+
+    def test_ascii_error(self):
+        self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)
+
+    def test_decode_replace(self):
+        decoder = self.getdecoder('utf-8')
+        assert decoder('caf\xe9', 4, 'replace', True) == (u'caf\ufffd', 4)
+
+    def test_utf16_errors(self):
+        # truncated BOM
+        for s in ["\xff", "\xfe"]:
+            self.checkdecodeerror(s, "utf-16", 0, len(s), addstuff=False)
+
+        for s in [
+                  # unexpected end of data ascii
+                  "\xff\xfeF",
+                  # unexpected end of data
+                  '\xff\xfe\xc0\xdb\x00', '\xff\xfe\xc0\xdb', '\xff\xfe\xc0',
+                  ]:
+            self.checkdecodeerror(s, "utf-16", 2, len(s), addstuff=False)
+        for s in [
+                  # illegal surrogate
+                  "\xff\xfe\xff\xdb\xff\xff",
+                  ]:
+            self.checkdecodeerror(s, "utf-16", 2, 4, addstuff=False)
+
+    def test_utf16_bugs(self):
+        s = '\x80-\xe9\xdeL\xa3\x9b'
+        py.test.raises(UnicodeDecodeError, utf8_codecs.str_decode_utf_16_le,
+                       s, len(s), True)
+
+    def test_utf7_bugs(self):
+        u = Utf8Str.from_unicode(u'A\u2262\u0391.')
+        assert utf8_codecs.unicode_encode_utf_7(u, len(u), None) == 'A+ImIDkQ.'
+
+    def test_utf7_tofrom_utf8_bug(self):
+        def _assert_decu7(input, expected):
+            assert (utf8_codecs.str_decode_utf_7(input, len(input), None) ==
+                        (expected, len(input)))
+
+        _assert_decu7('+-', u'+')
+        _assert_decu7('+-+-', u'++')
+        _assert_decu7('+-+AOQ-', u'+\xe4')
+        _assert_decu7('+AOQ-', u'\xe4')
+        _assert_decu7('+AOQ-', u'\xe4')
+        _assert_decu7('+AOQ- ', u'\xe4 ')
+        _assert_decu7(' +AOQ-', u' \xe4')
+        _assert_decu7(' +AOQ- ', u' \xe4 ')
+        _assert_decu7('+AOQ-+AOQ-', u'\xe4\xe4')
+
+        s_utf7 = 'Die M+AOQ-nner +AOQ-rgen sich!'
+        s_utf8 = Utf8Str.from_unicode(u'Die Männer ärgen sich!')
+        s_utf8_esc = Utf8Str.from_unicode(u'Die M\xe4nner \xe4rgen sich!')
+
+        _assert_decu7(s_utf7, s_utf8_esc)
+        _assert_decu7(s_utf7, s_utf8)
+
+        assert utf8_codecs.unicode_encode_utf_7(s_utf8_esc, len(s_utf8_esc), None) == s_utf7
+        assert utf8_codecs.unicode_encode_utf_7(s_utf8,     len(s_utf8_esc), None) == s_utf7
+
+    def test_utf7_partial(self):
+        s = u"a+-b".encode('utf-7')
+        assert s == "a+--b"
+        decode = self.getdecoder('utf-7')
+        assert decode(s, 1, None) == (u'a', 1)
+        assert decode(s, 2, None) == (u'a', 1)
+        assert decode(s, 3, None) == (u'a+', 3)
+        assert decode(s, 4, None) == (u'a+-', 4)
+        assert decode(s, 5, None) == (u'a+-b', 5)
+
+    def test_utf7_surrogates(self):
+        encode_ = self.getencoder('utf-7')
+        encode = lambda u, s, err: encode_(Utf8Str.from_unicode(u), s, err)
+        decode = self.getdecoder('utf-7')
+
+        u = Utf8Str.from_unicode(u'\U000abcde')
+        assert encode_(u, len(u), None) == '+2m/c3g-'
+
+        # Unpaired surrogates are passed through
+        assert encode(u'\uD801', 1, None) == '+2AE-'
+        assert encode(u'\uD801x', 2, None) == '+2AE-x'
+        assert encode(u'\uDC01', 1, None) == '+3AE-'
+        assert encode(u'\uDC01x', 2, None) == '+3AE-x'
+        assert decode('+2AE-', 5, None) == (u'\uD801', 5)
+        assert decode('+2AE-x', 6, None) == (u'\uD801x', 6)
+        assert decode('+3AE-', 5, None) == (u'\uDC01', 5)
+        assert decode('+3AE-x', 6, None) == (u'\uDC01x', 6)
+
+        u = Utf8Str.from_unicode(u'\uD801\U000abcde')
+        assert encode_(u, len(u), None) == '+2AHab9ze-'
+        assert decode('+2AHab9ze-', 10, None) == (u'\uD801\U000abcde', 10)
+
+
+class TestUTF8Decoding(UnicodeTests):
+    def __init__(self):
+        self.decoder = self.getdecoder('utf-8')
+
+    def to_bytestring(self, bytes):
+        return ''.join(chr(int(c, 16)) for c in bytes.split())
+
+    def test_single_chars_utf8(self):
+        for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
+            self.checkdecode(s, "utf-8")
+
+    def test_utf8_surrogate(self):
+        # surrogates used to be allowed by python 2.x
+        py.test.raises(UnicodeDecodeError, self.checkdecode, u"\ud800", "utf-8")
+
+    def test_invalid_start_byte(self):
+        """
+        Test that an 'invalid start byte' error is raised when the first byte
+        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
+        4-bytes sequence. The invalid start byte is replaced with a single
+        U+FFFD when errors='replace'.
+        E.g. <80> is a continuation byte and can appear only after a start byte.
+        """
+        FFFD = u'\ufffd'
+        for byte in '\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
+            py.test.raises(UnicodeDecodeError, self.decoder, byte, 1, None, final=True)
+            self.checkdecodeerror(byte, 'utf-8', 0, 1, addstuff=False,
+                                  msg='invalid start byte')
+            assert self.decoder(byte, 1, 'replace', final=True) == (FFFD, 1)
+            assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'replace',
+                        final=True) ==
+                        (u'aaaa'+ FFFD + u'bbbb', 9))
+            assert self.decoder(byte, 1, 'ignore', final=True) == (u'', 1)
+            assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'ignore',
+                        final=True) == (u'aaaabbbb', 9))
+
+    def test_unexpected_end_of_data(self):
+        """
+        Test that an 'unexpected end of data' error is raised when the string
+        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
+        enough continuation bytes.  The incomplete sequence is replaced with a
+        single U+FFFD when errors='replace'.
+        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
+        sequence, but it's followed by only 2 valid continuation bytes and the
+        last continuation byte is missing.
+        Note: the continuation bytes must be all valid, if one of them is
+        invalid another error will be raised.
+        """
+        sequences = [
+            'C2', 'DF',
+            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
+            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
+            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
+            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
+            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
+            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
+        ]
+        FFFD = u'\ufffd'
+        for seq in sequences:
+            seq = self.to_bytestring(seq)
+            py.test.raises(UnicodeDecodeError, self.decoder, seq, len(seq),
+                   None, final=True)
+            self.checkdecodeerror(seq, 'utf-8', 0, len(seq), addstuff=False,
+                                  msg='unexpected end of data')
+            assert self.decoder(seq, len(seq), 'replace', final=True
+                                ) == (FFFD, len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
+                                 'replace', final=True) ==
+                        (u'aaaa'+ FFFD + u'bbbb', len(seq) + 8))
+            assert self.decoder(seq, len(seq), 'ignore', final=True
+                                ) == (u'', len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'ignore',
+                        final=True) == (u'aaaabbbb', len(seq) + 8))
+
+    def test_invalid_cb_for_2bytes_seq(self):
+        """
+        Test that an 'invalid continuation byte' error is raised when the
+        continuation byte of a 2-bytes sequence is invalid.  The start byte
+        is replaced by a single U+FFFD and the second byte is handled
+        separately when errors='replace'.
+        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
+        sequence, but 41 is not a valid continuation byte because it's the
+        ASCII letter 'A'.
+        """
+        FFFD = u'\ufffd'
+        FFFDx2 = FFFD * 2
+        sequences = [
+            ('C2 00', FFFD+u'\x00'), ('C2 7F', FFFD+u'\x7f'),
+            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
+            ('DF 00', FFFD+u'\x00'), ('DF 7F', FFFD+u'\x7f'),
+            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
+        ]
+        for seq, res in sequences:
+            seq = self.to_bytestring(seq)
+            py.test.raises(UnicodeDecodeError, self.decoder, seq, len(seq),
+                   None, final=True)
+            self.checkdecodeerror(seq, 'utf-8', 0, 1, addstuff=False,
+                                  msg='invalid continuation byte')
+            assert self.decoder(seq, len(seq), 'replace', final=True
+                                ) == (res, len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
+                                 'replace', final=True) ==
+                        (u'aaaa' + res + u'bbbb', len(seq) + 8))
+            res = res.replace(FFFD, u'')
+            assert self.decoder(seq, len(seq), 'ignore', final=True
+                                ) == (res, len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
+                                 'ignore', final=True) ==
+                        (u'aaaa' + res + u'bbbb', len(seq) + 8))
+
+    def test_invalid_cb_for_3bytes_seq(self):
+        """
+        Test that an 'invalid continuation byte' error is raised when the
+        continuation byte(s) of a 3-bytes sequence are invalid.  When
+        errors='replace', if the first continuation byte is valid, the first
+        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
+        third byte is handled separately, otherwise only the start byte is
+        replaced with a U+FFFD and the other continuation bytes are handled
+        separately.
+        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
+        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
+        because it's the ASCII letter 'A'.
+        Note: when the start byte is E0 or ED, the valid ranges for the first
+        continuation byte are limited to A0..BF and 80..9F respectively.
+        However, when the start byte is ED, Python 2 considers all the bytes
+        in range 80..BF valid.  This is fixed in Python 3.
+        """
+        FFFD = u'\ufffd'
+        FFFDx2 = FFFD * 2
+        sequences = [
+            ('E0 00', FFFD+u'\x00'), ('E0 7F', FFFD+u'\x7f'), ('E0 80', FFFDx2),
+            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
+            ('E0 A0 00', FFFD+u'\x00'), ('E0 A0 7F', FFFD+u'\x7f'),
+            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
+            ('E0 BF 00', FFFD+u'\x00'), ('E0 BF 7F', FFFD+u'\x7f'),
+            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+u'\x00'),
+            ('E1 7F', FFFD+u'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
+            ('E1 80 00', FFFD+u'\x00'), ('E1 80 7F', FFFD+u'\x7f'),
+            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
+            ('E1 BF 00', FFFD+u'\x00'), ('E1 BF 7F', FFFD+u'\x7f'),
+            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+u'\x00'),
+            ('EC 7F', FFFD+u'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
+            ('EC 80 00', FFFD+u'\x00'), ('EC 80 7F', FFFD+u'\x7f'),
+            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
+            ('EC BF 00', FFFD+u'\x00'), ('EC BF 7F', FFFD+u'\x7f'),
+            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+u'\x00'),
+            ('ED 7F', FFFD+u'\x7f'),
+            # ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
+            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+u'\x00'),
+            ('ED 80 7F', FFFD+u'\x7f'), ('ED 80 C0', FFFDx2),
+            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+u'\x00'),
+            ('ED 9F 7F', FFFD+u'\x7f'), ('ED 9F C0', FFFDx2),
+            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+u'\x00'),
+            ('EE 7F', FFFD+u'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
+            ('EE 80 00', FFFD+u'\x00'), ('EE 80 7F', FFFD+u'\x7f'),
+            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
+            ('EE BF 00', FFFD+u'\x00'), ('EE BF 7F', FFFD+u'\x7f'),
+            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+u'\x00'),
+            ('EF 7F', FFFD+u'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
+            ('EF 80 00', FFFD+u'\x00'), ('EF 80 7F', FFFD+u'\x7f'),
+            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
+            ('EF BF 00', FFFD+u'\x00'), ('EF BF 7F', FFFD+u'\x7f'),
+            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
+        ]
+        for seq, res in sequences:
+            seq = self.to_bytestring(seq)
+            py.test.raises(UnicodeDecodeError, self.decoder, seq, len(seq),
+                   None, final=True)
+            self.checkdecodeerror(seq, 'utf-8', 0, len(seq)-1, addstuff=False,
+                                  msg='invalid continuation byte')
+            assert self.decoder(seq, len(seq), 'replace', final=True
+                                ) == (res, len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
+                                 'replace', final=True) ==
+                        (u'aaaa' + res + u'bbbb', len(seq) + 8))
+            res = res.replace(FFFD, u'')
+            assert self.decoder(seq, len(seq), 'ignore', final=True
+                                ) == (res, len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'ignore',
+                        final=True) == (u'aaaa' + res + u'bbbb', len(seq) + 8))
+
+    def test_invalid_cb_for_4bytes_seq(self):
+        """
+        Test that an 'invalid continuation byte' error is raised when the
+        continuation byte(s) of a 4-bytes sequence are invalid.  When
+        errors='replace',the start byte and all the following valid
+        continuation bytes are replaced with a single U+FFFD, and all the bytes
+        starting from the first invalid continuation bytes (included) are
+        handled separately.
+        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
+        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
+        because it's the ASCII letter 'A'.
+        Note: when the start byte is E0 or ED, the valid ranges for the first
+        continuation byte are limited to A0..BF and 80..9F respectively.
+        However, when the start byte is ED, Python 2 considers all the bytes
+        in range 80..BF valid.  This is fixed in Python 3.
+        """
+        FFFD = u'\ufffd'
+        FFFDx2 = FFFD * 2
+        sequences = [
+            ('F0 00', FFFD+u'\x00'), ('F0 7F', FFFD+u'\x7f'), ('F0 80', FFFDx2),
+            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
+            ('F0 90 00', FFFD+u'\x00'), ('F0 90 7F', FFFD+u'\x7f'),
+            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
+            ('F0 BF 00', FFFD+u'\x00'), ('F0 BF 7F', FFFD+u'\x7f'),
+            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
+            ('F0 90 80 00', FFFD+u'\x00'), ('F0 90 80 7F', FFFD+u'\x7f'),
+            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
+            ('F0 90 BF 00', FFFD+u'\x00'), ('F0 90 BF 7F', FFFD+u'\x7f'),
+            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
+            ('F0 BF 80 00', FFFD+u'\x00'), ('F0 BF 80 7F', FFFD+u'\x7f'),
+            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
+            ('F0 BF BF 00', FFFD+u'\x00'), ('F0 BF BF 7F', FFFD+u'\x7f'),
+            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
+            ('F1 00', FFFD+u'\x00'), ('F1 7F', FFFD+u'\x7f'), ('F1 C0', FFFDx2),
+            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+u'\x00'),
+            ('F1 80 7F', FFFD+u'\x7f'), ('F1 80 C0', FFFDx2),
+            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+u'\x00'),
+            ('F1 BF 7F', FFFD+u'\x7f'), ('F1 BF C0', FFFDx2),
+            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+u'\x00'),
+            ('F1 80 80 7F', FFFD+u'\x7f'), ('F1 80 80 C0', FFFDx2),
+            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+u'\x00'),
+            ('F1 80 BF 7F', FFFD+u'\x7f'), ('F1 80 BF C0', FFFDx2),
+            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+u'\x00'),
+            ('F1 BF 80 7F', FFFD+u'\x7f'), ('F1 BF 80 C0', FFFDx2),
+            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+u'\x00'),
+            ('F1 BF BF 7F', FFFD+u'\x7f'), ('F1 BF BF C0', FFFDx2),
+            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+u'\x00'),
+            ('F3 7F', FFFD+u'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
+            ('F3 80 00', FFFD+u'\x00'), ('F3 80 7F', FFFD+u'\x7f'),
+            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
+            ('F3 BF 00', FFFD+u'\x00'), ('F3 BF 7F', FFFD+u'\x7f'),
+            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
+            ('F3 80 80 00', FFFD+u'\x00'), ('F3 80 80 7F', FFFD+u'\x7f'),
+            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
+            ('F3 80 BF 00', FFFD+u'\x00'), ('F3 80 BF 7F', FFFD+u'\x7f'),
+            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
+            ('F3 BF 80 00', FFFD+u'\x00'), ('F3 BF 80 7F', FFFD+u'\x7f'),
+            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
+            ('F3 BF BF 00', FFFD+u'\x00'), ('F3 BF BF 7F', FFFD+u'\x7f'),
+            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
+            ('F4 00', FFFD+u'\x00'), ('F4 7F', FFFD+u'\x7f'), ('F4 90', FFFDx2),
+            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
+            ('F4 80 00', FFFD+u'\x00'), ('F4 80 7F', FFFD+u'\x7f'),
+            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
+            ('F4 8F 00', FFFD+u'\x00'), ('F4 8F 7F', FFFD+u'\x7f'),
+            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
+            ('F4 80 80 00', FFFD+u'\x00'), ('F4 80 80 7F', FFFD+u'\x7f'),
+            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
+            ('F4 80 BF 00', FFFD+u'\x00'), ('F4 80 BF 7F', FFFD+u'\x7f'),
+            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
+            ('F4 8F 80 00', FFFD+u'\x00'), ('F4 8F 80 7F', FFFD+u'\x7f'),
+            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
+            ('F4 8F BF 00', FFFD+u'\x00'), ('F4 8F BF 7F', FFFD+u'\x7f'),
+            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
+        ]
+        for seq, res in sequences:
+            seq = self.to_bytestring(seq)
+            py.test.raises(UnicodeDecodeError, self.decoder, seq, len(seq),
+                   None, final=True)
+            self.checkdecodeerror(seq, 'utf-8', 0, len(seq)-1, addstuff=False,
+                                  msg='invalid continuation byte')
+            assert self.decoder(seq, len(seq), 'replace', final=True
+                                ) == (res, len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 
+                                 'replace', final=True) ==
+                        (u'aaaa' + res + u'bbbb', len(seq) + 8))
+            res = res.replace(FFFD, u'')
+            assert self.decoder(seq, len(seq), 'ignore', final=True
+                                ) == (res, len(seq))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'ignore',
+                        final=True) == (u'aaaa' + res + u'bbbb', len(seq) + 8))
+
+    def test_utf8_errors(self):
+        # unexpected end of data
+        for s in ['\xd7', '\xd6', '\xeb\x96', '\xf0\x90\x91', '\xc2', '\xdf']:
+            self.checkdecodeerror(s, 'utf-8', 0, len(s), addstuff=False,
+                                  msg='unexpected end of data')
+
+        # invalid data 2 byte
+        for s in ["\xd7\x50", "\xd6\x06", "\xd6\xD6"]:
+            self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True,
+                                  msg='invalid continuation byte')
+        # invalid data 3 byte
+        for s in ["\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95"]:
+            self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True,
+                                  msg='invalid continuation byte')
+        for s in ["\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5"]:
+            self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True,
+                                  msg='invalid continuation byte')
+        # invalid data 4 byte
+        for s in ["\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93"]:
+            self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True,
+                                  msg='invalid continuation byte')
+        for s in ["\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93"]:
+            self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True,
+                                  msg='invalid continuation byte')
+        for s in ["\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3"]:
+            self.checkdecodeerror(s, "utf-8", 0, 3, addstuff=True,
+                                  msg='invalid continuation byte')
+
+    def test_issue8271(self):
+        # From CPython
+        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
+        # only the start byte and the continuation byte(s) are now considered
+        # invalid, instead of the number of bytes specified by the start byte.
+        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
+        # table 3-8, Row 2) for more information about the algorithm used.
+        FFFD = u'\ufffd'
+        sequences = [
+            # invalid start bytes
+            ('\x80', FFFD), # continuation byte
+            ('\x80\x80', FFFD*2), # 2 continuation bytes
+            ('\xc0', FFFD),
+            ('\xc0\xc0', FFFD*2),
+            ('\xc1', FFFD),
+            ('\xc1\xc0', FFFD*2),
+            ('\xc0\xc1', FFFD*2),
+            # with start byte of a 2-byte sequence
+            ('\xc2', FFFD), # only the start byte
+            ('\xc2\xc2', FFFD*2), # 2 start bytes
+            ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
+            ('\xc2\x41', FFFD+'A'), # invalid continuation byte
+            # with start byte of a 3-byte sequence
+            ('\xe1', FFFD), # only the start byte
+            ('\xe1\xe1', FFFD*2), # 2 start bytes
+            ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
+            ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
+            ('\xe1\x80', FFFD), # only 1 continuation byte
+            ('\xe1\x41', FFFD+'A'), # invalid continuation byte
+            ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
+            ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
+            ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
+            ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
+            ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
+            # with start byte of a 4-byte sequence
+            ('\xf1', FFFD), # only the start byte
+            ('\xf1\xf1', FFFD*2), # 2 start bytes
+            ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
+            ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
+            ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
+            ('\xf1\x80', FFFD), # only 1 continuation bytes
+            ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
+            ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
+            ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
+            ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
+            ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
+            ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
+            ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
+            ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
+            ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
+            ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
+            ('\xf1\xf1\x80\x41', FFFD*2+'A'),
+            ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
+            # with invalid start byte of a 4-byte sequence (rfc2279)
+            ('\xf5', FFFD), # only the start byte
+            ('\xf5\xf5', FFFD*2), # 2 start bytes
+            ('\xf5\x80', FFFD*2), # only 1 continuation byte
+            ('\xf5\x80\x80', FFFD*3), # only 2 continuation byte
+            ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
+            ('\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
+            ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
+            ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
+            # with invalid start byte of a 5-byte sequence (rfc2279)
+            ('\xf8', FFFD), # only the start byte
+            ('\xf8\xf8', FFFD*2), # 2 start bytes
+            ('\xf8\x80', FFFD*2), # only one continuation byte
+            ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
+            ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
+            # with invalid start byte of a 6-byte sequence (rfc2279)
+            ('\xfc', FFFD), # only the start byte
+            ('\xfc\xfc', FFFD*2), # 2 start bytes
+            ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
+            ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
+            # invalid start byte
+            ('\xfe', FFFD),
+            ('\xfe\x80\x80', FFFD*3),
+            # other sequences
+            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
+            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
+            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
+            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
+             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+        ]
+
+        for n, (seq, res) in enumerate(sequences):
+            decoder = self.getdecoder('utf-8')
+            py.test.raises(UnicodeDecodeError, decoder, seq, len(seq), None, final=True)
+            assert decoder(seq, len(seq), 'replace', final=True
+                           ) == (res, len(seq))
+            assert decoder(seq + 'b', len(seq) + 1, 'replace', final=True
+                           ) == (res + u'b', len(seq) + 1)
+            res = res.replace(FFFD, u'')
+            assert decoder(seq, len(seq), 'ignore', final=True
+                           ) == (res, len(seq))
+
+
+class TestEncoding(UnicodeTests):
+    def test_all_ascii(self):
+        for i in range(128):
+            if sys.version >= "2.7":
+                self.checkencode(unichr(i), "utf-7")
+            for encoding in "utf-8 latin-1 ascii".split():
+                self.checkencode(unichr(i), encoding)
+
+    def test_all_first_256(self):
+        for i in range(256):
+            if sys.version >= "2.7":
+                self.checkencode(unichr(i), "utf-7")
+            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+                             "utf-32 utf-32-be utf-32-le").split():
+                self.checkencode(unichr(i), encoding)
+
+    def test_first_10000(self):
+        for i in range(10000):
+            if sys.version >= "2.7":
+                self.checkencode(unichr(i), "utf-7")
+            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+                             "utf-32 utf-32-be utf-32-le").split():
+                self.checkencode(unichr(i), encoding)
+
+    def test_random(self):
+        for i in range(10000):
+            v = random.randrange(sys.maxunicode)
+            if 0xd800 <= v <= 0xdfff:
+                continue
+            uni = unichr(v)
+            if sys.version >= "2.7":
+                self.checkencode(uni, "utf-7")
+            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+                             "utf-32 utf-32-be utf-32-le").split():
+                self.checkencode(uni, encoding)
+
+    def test_maxunicode(self):
+        uni = unichr(sys.maxunicode)
+        if sys.version >= "2.7":
+            self.checkencode(uni, "utf-7")
+        for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+                         "utf-32 utf-32-be utf-32-le").split():
+            self.checkencode(uni, encoding)
+
+    def test_empty(self):
+        for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+                         "utf-32 utf-32-be utf-32-le").split():
+            self.checkencode(u'', encoding)
+
+    def test_single_chars_utf8(self):
+        # check every number of bytes per char
+        for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
+            self.checkencode(s, "utf-8")
+
+    # TODO: Is this test useful?
+    def test_utf8_surrogates(self):
+        # make sure that the string itself is not marshalled
+        u = u"\ud800"
+        for i in range(4):
+            u += u"\udc00"
+
+        if utf8_codecs.MAXUNICODE < 65536:
+            # Check replacing of two surrogates by single char while encoding
+            self.checkencode(u, "utf-8")
+        else:
+            # This is not done in wide unicode builds
+            py.test.raises(UnicodeEncodeError, self.checkencode, u, "utf-8")
+
+    def test_ascii_error(self):
+        self.checkencodeerror(
+            Utf8Str.from_unicode(u"abc\xFF\xFF\xFFcde"), "ascii", 3, 6)
+
+    def test_latin1_error(self):
+        self.checkencodeerror(
+            Utf8Str.from_unicode(u"abc\uffff\uffff\uffffcde"), "latin-1", 3, 6)
+
+    def test_mbcs(self):
+        if sys.platform != 'win32':
+            py.test.skip("mbcs encoding is win32-specific")
+        self.checkencode(u'encoding test', "mbcs")
+        self.checkdecode('decoding test', "mbcs")
+        # XXX test this on a non-western Windows installation
+        self.checkencode(u"\N{GREEK CAPITAL LETTER PHI}", "mbcs") # a F
+        self.checkencode(u"\N{GREEK CAPITAL LETTER PSI}", "mbcs") # a ?
+
+    def test_mbcs_decode_force_ignore(self):
+        if sys.platform != 'win32':
+            py.test.skip("mbcs encoding is win32-specific")
+
+        # XXX: requires a locale w/ a restrictive encoding to test
+        from rpython.rlib.rlocale import getdefaultlocale
+        if getdefaultlocale()[1] != 'cp932':
+            py.test.skip("requires cp932 locale")
+
+        s = '\xff\xf4\x8f\xbf\xbf'
+        decoder = self.getdecoder('mbcs')
+        assert decoder(s, len(s), 'strict') == (u'\U0010ffff', 5)
+        py.test.raises(UnicodeEncodeError, decoder, s, len(s), 'strict',
+                       force_ignore=False)
+
+    def test_mbcs_encode_force_replace(self):
+        if sys.platform != 'win32':
+            py.test.skip("mbcs encoding is win32-specific")
+        u = u'@test_2224_tmp-?L??\udc80'
+        encoder = self.getencoder('mbcs')
+        assert encoder(u, len(u), 'strict') == '@test_2224_tmp-?L???'
+        py.test.raises(UnicodeEncodeError, encoder, u, len(u), 'strict',
+                       force_replace=False)
+
+    def test_encode_decimal(self):
+        encoder = self.getencoder('decimal')
+        assert encoder(u' 12, 34 ', 8, None) == ' 12, 34 '
+        py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
+        assert encoder(u'u\u1234', 2, 'replace') == 'u?'
+
+
+# TODO: Do I need to actually skip these?
+class TestTranslation(object):
+    def setup_class(cls):
+        if utf8_codecs.MAXUNICODE != sys.maxunicode:
+            py.test.skip("these tests cannot run on the llinterp")
+
+    def test_utf8(self):
+        from rpython.rtyper.test.test_llinterp import interpret
+        def f(x):
+
+            s1 = "".join(["\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"] * x)
+            u, consumed = utf8_codecs.str_decode_utf_8(s1, len(s1), True)
+            s2 = utf8_codecs.unicode_encode_utf_8(u, len(u), True)
+            return s1 == s2
+        res = interpret(f, [2])
+        assert res
+
+    def test_encode_surrogate_pair(self):
+        # NOTE(review): this test still referenced the old `runicode` module,
+        # which is not imported in this file -- switched to `utf8_codecs` to
+        # match the rest of the migrated tests. TODO confirm that UNICHR and
+        # the unicode-escape encoders are exposed by utf8_codecs.
+        u = utf8_codecs.UNICHR(0xD800) + utf8_codecs.UNICHR(0xDC00)
+        if utf8_codecs.MAXUNICODE < 65536:
+            # Narrow unicode build, consider utf16 surrogate pairs
+            assert utf8_codecs.unicode_encode_unicode_escape(
+                u, len(u), True) == r'\U00010000'
+            assert utf8_codecs.unicode_encode_raw_unicode_escape(
+                u, len(u), True) == r'\U00010000'
+        else:
+            # Wide unicode build, don't merge utf16 surrogate pairs
+            assert utf8_codecs.unicode_encode_unicode_escape(
+                u, len(u), True) == r'\ud800\udc00'
+            assert utf8_codecs.unicode_encode_raw_unicode_escape(
+                u, len(u), True) == r'\ud800\udc00'
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -34,25 +34,25 @@
 
 # These functions take and return unwrapped rpython strings and unicodes
 def decode_unicode_escape(space, string):
-    from pypy.interpreter.utf8 import decode_unicode_escape
+    from pypy.interpreter.utf8_codecs import str_decode_unicode_escape
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
-    result, consumed = decode_unicode_escape(
+    result, consumed = str_decode_unicode_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
         unicodedata_handler=unicodedata_handler)
     return result
 
 def decode_raw_unicode_escape(space, string):
-    from pypy.interpreter.utf8 import decode_raw_unicode_escape
-    result, consumed = decode_raw_unicode_escape(
+    from pypy.interpreter.utf8_codecs import str_decode_raw_unicode_escape
+    result, consumed = str_decode_raw_unicode_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space))
     return result
 
 def decode_utf8(space, string):
-    from pypy.interpreter.utf8 import decode_utf_8
-    result, consumed = decode_utf_8(
+    from pypy.interpreter.utf8_codecs import str_decode_utf_8
+    result, consumed = str_decode_utf_8(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
         allow_surrogates=True)
@@ -60,4 +60,4 @@
 
 def encode_utf8(space, uni):
     # unicode to string...
-    return s.bytes
+    return uni.bytes
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -1,8 +1,7 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.runicode import utf8_code_length
-
-MAXUNICODE = 0x10ffff
+from rpython.rlib.rarithmetic import r_uint
 
 def utf8chr(value):
     # Like unichr, but returns a Utf8Str object
@@ -91,7 +90,7 @@
         stop_pos = start
         # TODO: Is detecting ascii-ness here actually useful? If it will
         #       happen in __init__ anyway, maybe its not worth the extra
-        #       complexity.
+        #       (code) complexity.
         is_ascii = True
         while stop_pos < stop:
             stop_pos += 1
@@ -115,6 +114,54 @@
 
         return False
 
+    @specialize.argtype(1)
+    def __contains__(self, other):
+        if isinstance(other, Utf8Str):
+            return other.bytes in self.bytes
+        if isinstance(other, unicode):
+            # TODO: Assert fail if translated
+            return other in unicode(self.bytes, 'utf8')
+        if isinstance(other, str):
+            return other in self.bytes
+
+        raise TypeError()
+
+    def __iter__(self):
+        byte_pos = 0
+        while byte_pos < len(self.bytes):
+            cplen = utf8_code_length[ord(self.bytes[byte_pos])]
+            yield Utf8Str(self.bytes[byte_pos:byte_pos+cplen])
+            byte_pos += cplen
+
+    @specialize.argtype(1)
+    def find(self, other):
+        if isinstance(other, Utf8Str):
+            return self.bytes.find(other.bytes)
+        if isinstance(other, unicode):
+            return unicode(self.bytes, 'utf8').find(other)
+        if isinstance(other, str):
+            return self.bytes.find(other)
+
+    def rfind(self, other):
+        if isinstance(other, Utf8Str):
+            return self.bytes.rfind(other.bytes)
+        if isinstance(other, unicode):
+            return unicode(self.bytes, 'utf8').rfind(other)
+        if isinstance(other, str):
+            return self.bytes.rfind(other)
+
+    def endswith(self, other):
+        return self.rfind(other) == len(self) - len(other)
+
+    def as_unicode(self):
+        """NOT_RPYTHON"""
+        return self.bytes.decode('utf-8')
+
+    @staticmethod
+    def from_unicode(u):
+        """NOT_RPYTHON"""
+        return Utf8Str(u.encode('utf-8'))
+
 class Utf8Builder(object):
     @specialize.argtype(1)
     def __init__(self, init_size=None):
@@ -127,7 +174,7 @@
 
     @specialize.argtype(1)
     def append(self, c):
-        if isinstance(c, int):
+        if isinstance(c, int) or isinstance(c, r_uint):
             if c < 0x80:
                 self._builder.append(chr(c))
             elif c < 0x800:
@@ -147,8 +194,14 @@
                 self._is_ascii = False
             else:
                 raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
+        elif isinstance(c, Utf8Str):
+            self._builder.append(c.bytes)
+            if not c._is_ascii:
+                self._is_ascii = False
         else:
-            # TODO: Only allow ord(c) in [0, 127]
+            # TODO: Remove this check?
+            if len(c) == 1:
+                assert ord(c) < 128
             self._builder.append(c)
 
     def append_slice(self, s, start, end, is_ascii=False):
@@ -159,411 +212,3 @@
     def build(self):
         return Utf8Str(self._builder.build(), self._is_ascii)
 
-
-# ____________________________________________________________
-# Escape-parsing functions
-
-def decode_raw_unicode_escape(s, size, errors, final=False,
-                                  errorhandler=None):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_decode
-    if size == 0:
-        # TODO:?
-        return Utf8Str('', True), 0
-
-    result = Utf8Builder(size)
-    pos = 0
-    while pos < size:
-        ch = s[pos]
-
-        # Non-escape characters are interpreted as Unicode ordinals
-        if ch != '\\':
-            result.append(ch)
-            pos += 1
-            continue
-
-        # \u-escapes are only interpreted iff the number of leading
-        # backslashes is odd
-        bs = pos
-        while pos < size:
-            pos += 1
-            if pos == size or s[pos] != '\\':
-                break
-            result.append('\\')
-
-        # we have a backslash at the end of the string, stop here
-        if pos >= size:
-            result.append('\\')
-            break
-
-        if ((pos - bs) & 1 == 0 or
-            pos >= size or
-            (s[pos] != 'u' and s[pos] != 'U')):
-            result.append('\\')
-            result.append(s[pos])
-            pos += 1
-            continue
-
-        digits = 4 if s[pos] == 'u' else 8
-        message = "truncated \\uXXXX"
-        pos += 1
-        pos = hexescape(result, s, pos, digits,
-                        "rawunicodeescape", errorhandler, message, errors)
-
-    return result.build(), pos
-
-# Specialize on the errorhandler when it's a constant
- at specialize.arg_or_var(4)
-def decode_unicode_escape(s, size, errors, final=False,
-                              errorhandler=None,
-                              unicodedata_handler=None):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_decode
-
-    if size == 0:
-        return Utf8Str('', True), 0
-
-    builder = Utf8Builder(size)
-    pos = 0
-    while pos < size:
-        ch = s[pos]
-
-        # Non-escape characters are interpreted as Unicode ordinals
-        if ch != '\\':
-            builder.append(ch)
-            pos += 1
-            continue
-
-        # - Escapes
-        pos += 1
-        if pos >= size:
-            message = "\\ at end of string"
-            res, pos = errorhandler(errors, "unicodeescape",
-                                    message, s, pos-1, size)
-            builder.append(res)
-            continue
-
-        ch = s[pos]
-        pos += 1
-        # \x escapes
-        if ch == '\n': pass
-        elif ch == '\\': builder.append('\\')
-        elif ch == '\'': builder.append('\'')
-        elif ch == '\"': builder.append('\"')
-        elif ch == 'b' : builder.append('\b')
-        elif ch == 'f' : builder.append('\f')
-        elif ch == 't' : builder.append('\t')
-        elif ch == 'n' : builder.append('\n')
-        elif ch == 'r' : builder.append('\r')
-        elif ch == 'v' : builder.append('\v')
-        elif ch == 'a' : builder.append('\a')
-        elif '0' <= ch <= '7':
-            x = ord(ch) - ord('0')
-            if pos < size:
-                ch = s[pos]
-                if '0' <= ch <= '7':
-                    pos += 1
-                    x = (x<<3) + ord(ch) - ord('0')
-                    if pos < size:
-                        ch = s[pos]
-                        if '0' <= ch <= '7':
-                            pos += 1
-                            x = (x<<3) + ord(ch) - ord('0')
-            builder.append(x)
-        # hex escapes
-        # \xXX
-        elif ch == 'x':
-            digits = 2
-            message = "truncated \\xXX escape"
-            pos = hexescape(builder, s, pos, digits,
-                            "unicodeescape", errorhandler, message, errors)
-
-        # \uXXXX
-        elif ch == 'u':
-            digits = 4
-            message = "truncated \\uXXXX escape"
-            pos = hexescape(builder, s, pos, digits,
-                            "unicodeescape", errorhandler, message, errors)
-
-        #  \UXXXXXXXX
-        elif ch == 'U':
-            digits = 8
-            message = "truncated \\UXXXXXXXX escape"
-            pos = hexescape(builder, s, pos, digits,
-                            "unicodeescape", errorhandler, message, errors)
-
-        # \N{name}
-        elif ch == 'N':
-            message = "malformed \\N character escape"
-            look = pos
-            if unicodedata_handler is None:
-                message = ("\\N escapes not supported "
-                           "(can't load unicodedata module)")
-                res, pos = errorhandler(errors, "unicodeescape",
-                                        message, s, pos-1, size)
-                builder.append(res)
-                continue
-
-            if look < size and s[look] == '{':
-                # look for the closing brace
-                while look < size and s[look] != '}':
-                    look += 1
-                if look < size and s[look] == '}':
-                    # found a name.  look it up in the unicode database
-                    message = "unknown Unicode character name"
-                    name = s[pos+1:look]
-                    code = unicodedata_handler.call(name)
-                    if code < 0:
-                        res, pos = errorhandler(errors, "unicodeescape",
-                                                message, s, pos-1, look+1)
-                        builder.append(res)
-                        continue
-                    pos = look + 1
-                    builder.append(code)
-                else:
-                    res, pos = errorhandler(errors, "unicodeescape",
-                                            message, s, pos-1, look+1)
-                    builder.append(res)
-            else:
-                res, pos = errorhandler(errors, "unicodeescape",
-                                        message, s, pos-1, look+1)
-                builder.append(res)
-        else:
-            builder.append('\\')
-            builder.append(ch)
-
-    return builder.build(), pos
-
-hexdigits = "0123456789ABCDEFabcdef"
-
-def hexescape(builder, s, pos, digits,
-              encoding, errorhandler, message, errors):
-    chr = 0
-    if pos + digits > len(s):
-        endinpos = pos
-        while endinpos < len(s) and s[endinpos] in hexdigits:
-            endinpos += 1
-        res, pos = errorhandler(errors, encoding,
-                                message, s, pos-2, endinpos)
-        builder.append(res)
-    else:
-        try:
-            chr = r_uint(int(s[pos:pos+digits], 16))
-        except ValueError:
-            endinpos = pos
-            while s[endinpos] in hexdigits:
-                endinpos += 1
-            res, pos = errorhandler(errors, encoding,
-                                    message, s, pos-2, endinpos)
-            builder.append(res)
-        else:
-            # when we get here, chr is a 32-bit unicode character
-            if chr <= MAXUNICODE:
-                builder.append(chr)
-                pos += digits
-
-            else:
-                message = "illegal Unicode character"
-                res, pos = errorhandler(errors, encoding,
-                                        message, s, pos-2, pos+digits)
-                builder.append(res)
-    return pos
-
-# ____________________________________________________________
-
-# Converting bytes (utf8) to unicode?
-# I guess we just make sure we're looking at valid utf-8 and then make the
-# object?
-
-def decode_utf_8(s, size, errors, final=False,
-                     errorhandler=None, allow_surrogates=False):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_decode
-    result = Utf8Builder(size)
-    pos = decode_utf_8_impl(s, size, errors, final, errorhandler, result,
-                            allow_surrogates=allow_surrogates)
-    return result.build(), pos
-
-def decode_utf_8_impl(s, size, errors, final, errorhandler, result,
-                      allow_surrogates):
-    if size == 0:
-        return 0
-
-    # TODO: Instead of assembling and then re-disassembling the codepoints,
-    #       just use builder.append_slice
-    pos = 0
-    while pos < size:
-        ordch1 = ord(s[pos])
-        # fast path for ASCII
-        # XXX maybe use a while loop here
-        if ordch1 < 0x80:
-            result.append(ordch1)
-            pos += 1
-            continue
-
-        n = utf8_code_length[ordch1]
-        if pos + n > size:
-            if not final:
-                break
-            charsleft = size - pos - 1 # either 0, 1, 2
-            # note: when we get the 'unexpected end of data' we don't care
-            # about the pos anymore and we just ignore the value
-            if not charsleft:
-                # there's only the start byte and nothing else
-                r, pos = errorhandler(errors, 'utf8',
-                                      'unexpected end of data',
-                                      s, pos, pos+1)
-                result.append(r)
-                break
-            ordch2 = ord(s[pos+1])
-            if n == 3:
-                # 3-bytes seq with only a continuation byte
-                if (ordch2>>6 != 0x2 or   # 0b10
-                    (ordch1 == 0xe0 and ordch2 < 0xa0)):
-                    # or (ordch1 == 0xed and ordch2 > 0x9f)
-                    # second byte invalid, take the first and continue
-                    r, pos = errorhandler(errors, 'utf8',
-                                          'invalid continuation byte',
-                                          s, pos, pos+1)
-                    result.append(r)
-                    continue
-                else:
-                    # second byte valid, but third byte missing
-                    r, pos = errorhandler(errors, 'utf8',
-                                      'unexpected end of data',
-                                      s, pos, pos+2)
-                    result.append(r)
-                    break
-            elif n == 4:
-                # 4-bytes seq with 1 or 2 continuation bytes
-                if (ordch2>>6 != 0x2 or    # 0b10
-                    (ordch1 == 0xf0 and ordch2 < 0x90) or
-                    (ordch1 == 0xf4 and ordch2 > 0x8f)):
-                    # second byte invalid, take the first and continue
-                    r, pos = errorhandler(errors, 'utf8',
-                                          'invalid continuation byte',
-                                          s, pos, pos+1)
-                    result.append(r)
-                    continue
-                elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2:   # 0b10
-                    # third byte invalid, take the first two and continue
-                    r, pos = errorhandler(errors, 'utf8',
-                                          'invalid continuation byte',
-                                          s, pos, pos+2)
-                    result.append(r)
-                    continue
-                else:
-                    # there's only 1 or 2 valid cb, but the others are missing
-                    r, pos = errorhandler(errors, 'utf8',
-                                      'unexpected end of data',
-                                      s, pos, pos+charsleft+1)
-                    result.append(r)
-                    break
-
-        if n == 0:
-            r, pos = errorhandler(errors, 'utf8',
-                                  'invalid start byte',
-                                  s, pos, pos+1)
-            result.append(r)
-
-        elif n == 1:
-            assert 0, "ascii should have gone through the fast path"
-
-        elif n == 2:
-            ordch2 = ord(s[pos+1])
-            if ordch2>>6 != 0x2:   # 0b10
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+1)
-                result.append(r)
-                continue
-            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
-            result.append(((ordch1 & 0x1F) << 6) +    # 0b00011111
-                           (ordch2 & 0x3F))           # 0b00111111
-            pos += 2
-
-        elif n == 3:
-            ordch2 = ord(s[pos+1])
-            ordch3 = ord(s[pos+2])
-            if (ordch2>>6 != 0x2 or    # 0b10
-                (ordch1 == 0xe0 and ordch2 < 0xa0)
-                # surrogates shouldn't be valid UTF-8!
-                or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
-                ):
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+1)
-                result.append(r)
-                continue
-            elif ordch3>>6 != 0x2:     # 0b10
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+2)
-                result.append(r)
-                continue
-            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
-            result.append((((ordch1 & 0x0F) << 12) +     # 0b00001111
-                           ((ordch2 & 0x3F) << 6) +      # 0b00111111
-                            (ordch3 & 0x3F)))            # 0b00111111
-            pos += 3
-
-        elif n == 4:
-            ordch2 = ord(s[pos+1])
-            ordch3 = ord(s[pos+2])
-            ordch4 = ord(s[pos+3])
-            if (ordch2>>6 != 0x2 or     # 0b10
-                (ordch1 == 0xf0 and ordch2 < 0x90) or
-                (ordch1 == 0xf4 and ordch2 > 0x8f)):
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+1)
-                result.append(r)
-                continue
-            elif ordch3>>6 != 0x2:     # 0b10
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+2)
-                result.append(r)
-                continue
-            elif ordch4>>6 != 0x2:     # 0b10
-                r, pos = errorhandler(errors, 'utf8',
-                                      'invalid continuation byte',
-                                      s, pos, pos+3)
-                result.append(r)
-                continue
-            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
-            c = (((ordch1 & 0x07) << 18) +      # 0b00000111
-                 ((ordch2 & 0x3F) << 12) +      # 0b00111111
-                 ((ordch3 & 0x3F) << 6) +       # 0b00111111
-                 (ordch4 & 0x3F))               # 0b00111111
-
-            # TODO: Why doesn't this raise an error when c > MAXUNICODE? If I'm
-            #       converting utf8 -> utf8 is this necessary
-            if c <= MAXUNICODE:
-                result.append(c)
-            pos += 4
-
-    return pos
-
-# ____________________________________________________________
-# Default error handlers
-
-
-def default_unicode_error_decode(errors, encoding, msg, s,
-                                 startingpos, endingpos):
-    if errors == 'replace':
-        return _unicode_error_replacement, endingpos
-    if errors == 'ignore':
-        return '', endingpos
-    raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
-_unicode_error_replacement = decode_raw_unicode_escape(
-    '\ufffd', 1, default_unicode_error_decode)
-
-def default_unicode_error_encode(errors, encoding, msg, u,
-                                 startingpos, endingpos):
-    if errors == 'replace':
-        return '?', None, endingpos
-    if errors == 'ignore':
-        return '', None, endingpos
-    raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
-
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/utf8_codecs.py
@@ -0,0 +1,1598 @@
+import sys
+
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.objectmodel import specialize
+from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.unicodedata import unicodedb
+from rpython.rlib.runicode import utf8_code_length
+
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
+
+
+BYTEORDER = sys.byteorder
+MAXUNICODE = 0x10ffff
+
+# ____________________________________________________________
+# Unicode escape {{{
+
+# Specialize on the errorhandler when it's a constant
+ at specialize.arg_or_var(4)
+def str_decode_unicode_escape(s, size, errors, final=False,
+                              errorhandler=None,
+                              unicodedata_handler=None):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+
+    if size == 0:
+        return Utf8Str('', True), 0
+
+    builder = Utf8Builder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            builder.append(ch)
+            pos += 1
+            continue
+
+        # - Escapes
+        pos += 1
+        if pos >= size:
+            message = "\\ at end of string"
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos-1, size)
+            builder.append(res)
+            continue
+
+        ch = s[pos]
+        pos += 1
+        # \x escapes
+        if ch == '\n': pass
+        elif ch == '\\': builder.append('\\')
+        elif ch == '\'': builder.append('\'')
+        elif ch == '\"': builder.append('\"')
+        elif ch == 'b' : builder.append('\b')
+        elif ch == 'f' : builder.append('\f')
+        elif ch == 't' : builder.append('\t')
+        elif ch == 'n' : builder.append('\n')
+        elif ch == 'r' : builder.append('\r')
+        elif ch == 'v' : builder.append('\v')
+        elif ch == 'a' : builder.append('\a')
+        elif '0' <= ch <= '7':
+            x = ord(ch) - ord('0')
+            if pos < size:
+                ch = s[pos]
+                if '0' <= ch <= '7':
+                    pos += 1
+                    x = (x<<3) + ord(ch) - ord('0')
+                    if pos < size:
+                        ch = s[pos]
+                        if '0' <= ch <= '7':
+                            pos += 1
+                            x = (x<<3) + ord(ch) - ord('0')
+            builder.append(x)
+        # hex escapes
+        # \xXX
+        elif ch == 'x':
+            digits = 2
+            message = "truncated \\xXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \uXXXX
+        elif ch == 'u':
+            digits = 4
+            message = "truncated \\uXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        #  \UXXXXXXXX
+        elif ch == 'U':
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \N{name}
+        elif ch == 'N':
+            message = "malformed \\N character escape"
+            look = pos
+            if unicodedata_handler is None:
+                message = ("\\N escapes not supported "
+                           "(can't load unicodedata module)")
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, size)
+                builder.append(res)
+                continue
+
+            if look < size and s[look] == '{':
+                # look for the closing brace
+                while look < size and s[look] != '}':
+                    look += 1
+                if look < size and s[look] == '}':
+                    # found a name.  look it up in the unicode database
+                    message = "unknown Unicode character name"
+                    name = s[pos+1:look]
+                    code = unicodedata_handler.call(name)
+                    if code < 0:
+                        res, pos = errorhandler(errors, "unicodeescape",
+                                                message, s, pos-1, look+1)
+                        builder.append(res)
+                        continue
+                    pos = look + 1
+                    builder.append(code)
+                else:
+                    res, pos = errorhandler(errors, "unicodeescape",
+                                            message, s, pos-1, look+1)
+                    builder.append(res)
+            else:
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, look+1)
+                builder.append(res)
+        else:
+            builder.append('\\')
+            builder.append(ch)
+
+    return builder.build(), pos
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+def hexescape(builder, s, pos, digits,
+              encoding, errorhandler, message, errors):
+    chr = 0
+    if pos + digits > len(s):
+        endinpos = pos
+        while endinpos < len(s) and s[endinpos] in hexdigits:
+            endinpos += 1
+        res, pos = errorhandler(errors, encoding,
+                                message, s, pos-2, endinpos)
+        builder.append(res)
+    else:
+        try:
+            chr = r_uint(int(s[pos:pos+digits], 16))
+        except ValueError:
+            endinpos = pos
+            while s[endinpos] in hexdigits:
+                endinpos += 1
+            res, pos = errorhandler(errors, encoding,
+                                    message, s, pos-2, endinpos)
+            builder.append(res)
+        else:
+            # when we get here, chr is a 32-bit unicode character
+            if chr <= MAXUNICODE:
+                builder.append(chr)
+                pos += digits
+
+            else:
+                message = "illegal Unicode character"
+                res, pos = errorhandler(errors, encoding,
+                                        message, s, pos-2, pos+digits)
+                builder.append(res)
+    return pos
+
+def make_unicode_escape_function(pass_printable=False, unicode_output=False,
+                                 quotes=False, prefix=None):
+    # Python3 has two similar escape functions: One to implement
+    # encode('unicode_escape') and which outputs bytes, and unicode.__repr__
+    # which outputs unicode.  They cannot share RPython code, so we generate
+    # them with the template below.
+    # Python2 does not really need this, but it reduces diffs between branches.
+
+    if unicode_output:
+        STRING_BUILDER = Utf8Builder
+        STR = Utf8Str
+    else:
+        STRING_BUILDER = StringBuilder
+        STR = str
+
+    def unicode_escape(s, size, errors, errorhandler=None):
+        # errorhandler is not used: this function cannot cause Unicode errors
+        result = STRING_BUILDER(size)
+
+        if quotes:
+            if prefix:
+                result.append(prefix)
+            if s.find('\'') != -1 and s.find('\"') == -1:
+                quote = ord('\"')
+                result.append('"')
+            else:
+                quote = ord('\'')
+                result.append('\'')
+        else:
+            quote = 0
+
+            if size == 0:
+                return STR('')
+
+        pos = 0
+        while pos < size:
+            #oc = ORD(s, pos)
+            oc = utf8ord(s, pos)
+
+            # Escape quotes
+            if quotes and (oc == quote or oc == ord('\\')):
+                result.append('\\')
+                result.append(chr(oc))
+                pos += 1
+                continue
+
+            # Map special whitespace to '\t', \n', '\r'
+            if oc == ord('\t'):
+                result.append('\\t')
+            elif oc == ord('\n'):
+                result.append('\\n')
+            elif oc == ord('\r'):
+                result.append('\\r')
+            elif oc == ord('\\'):
+                result.append('\\\\')
+
+            # Map non-printable or non-ascii to '\xhh' or '\uhhhh'
+            elif pass_printable and not unicodedb.isprintable(oc):
+                char_escape_helper(result, oc)
+            elif not pass_printable and (oc < 32 or oc >= 0x7F):
+                char_escape_helper(result, oc)
+
+            # Copy everything else as-is
+            else:
+                # TODO: Is this safe? Will we only have ascii characters here?
+                result.append(chr(oc))
+            pos += 1
+
+        if quotes:
+            result.append(chr(quote))
+        return result.build()
+
+    def char_escape_helper(result, char):
+        num = hex(char)
+        if char >= 0x10000:
+            result.append("\\U")
+            zeros = 8
+        elif char >= 0x100:
+            result.append("\\u")
+            zeros = 4
+        else:
+            result.append("\\x")
+            zeros = 2
+        lnum = len(num)
+        nb = zeros + 2 - lnum # num starts with '0x'
+        if nb > 0:
+            result.append_multiple_char('0', nb)
+        result.append_slice(num, 2, lnum)
+
+    return unicode_escape, char_escape_helper
+
+# This function is also used by _codecs/interp_codecs.py
+(unicode_encode_unicode_escape, raw_unicode_escape_helper
+ ) = make_unicode_escape_function()
+
+
+# }}}
+
+# ____________________________________________________________
+# Raw unicode escape {{{
+
+def str_decode_raw_unicode_escape(s, size, errors, final=False,
+                                  errorhandler=None):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+    if size == 0:
+        # TODO:?
+        return Utf8Str('', True), 0
+
+    result = Utf8Builder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            result.append(ch)
+            pos += 1
+            continue
+
+        # \u-escapes are only interpreted iff the number of leading
+        # backslashes is odd
+        bs = pos
+        while pos < size:
+            pos += 1
+            if pos == size or s[pos] != '\\':
+                break
+            result.append('\\')
+
+        # we have a backslash at the end of the string, stop here
+        if pos >= size:
+            result.append('\\')
+            break
+
+        if ((pos - bs) & 1 == 0 or
+            pos >= size or
+            (s[pos] != 'u' and s[pos] != 'U')):
+            result.append('\\')
+            result.append(s[pos])
+            pos += 1
+            continue
+
+        digits = 4 if s[pos] == 'u' else 8
+        message = "truncated \\uXXXX"
+        pos += 1
+        pos = hexescape(result, s, pos, digits,
+                        "rawunicodeescape", errorhandler, message, errors)
+
+    return result.build(), pos
+
+def unicode_encode_raw_unicode_escape(s, size, errors, errorhandler=None):
+    # errorhandler is not used: this function cannot cause Unicode errors
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        oc = utf8ord(s, pos)
+
+        if oc < 0x100:
+            result.append(chr(oc))
+        else:
+            raw_unicode_escape_helper(result, oc)
+        pos += 1
+
+    return result.build()
+
+# }}}
+
+# ____________________________________________________________
+# ascii & latin-1 {{{
+
+def str_decode_latin_1(s, size, errors, final=False,
+                       errorhandler=None):
+    # latin1 is equivalent to the first 256 ordinals in Unicode.
+    pos = 0
+    result = Utf8Builder(size)
+    while pos < size:
+        result.append(ord(s[pos]))
+        pos += 1
+    return result.build(), pos
+
+
+# Specialize on the errorhandler when it's a constant
+ at specialize.arg_or_var(4)
+def str_decode_ascii(s, size, errors, final=False,
+                     errorhandler=None):
+    # TODO: Is it worthwhile to try to avoid making the copy by first checking
+    #       the string for errors?
+
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+    # ASCII is equivalent to the first 128 ordinals in Unicode.
+    result = Utf8Builder(size)
+    pos = 0
+    while pos < size:
+        c = s[pos]
+        if ord(c) < 128:
+            result.append(c)
+            pos += 1
+        else:
+            r, pos = errorhandler(errors, "ascii", "ordinal not in range(128)",
+                                  s,  pos, pos + 1)
+            result.append(r)
+    return result.build(), pos
+
+
+# Specialize on the errorhandler when it's a constant
+ at specialize.arg_or_var(3)
+def unicode_encode_ucs1_helper(p, size, errors,
+                               errorhandler=None, limit=256):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+    if limit == 256:
+        reason = "ordinal not in range(256)"
+        encoding = "latin-1"
+    else:
+        reason = "ordinal not in range(128)"
+        encoding = "ascii"
+
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        od = utf8ord(p, pos)
+
+        if od < limit:
+            result.append(chr(od))
+            pos += 1
+        else:
+            # startpos for collecting unencodable chars
+            collstart = pos
+            collend = pos+1
+            while collend < len(p) and utf8ord(p, collend) >= limit:
+                collend += 1
+            ru, rs, pos = errorhandler(errors, encoding, reason, p,
+                                       collstart, collend)
+            if rs is not None:
+                # py3k only
+                result.append(rs)
+                continue
+            for ch in ru:
+                if ord(ch) < limit:
+                    result.append(chr(ord(ch)))
+                else:
+                    errorhandler("strict", encoding, reason, p,
+                                 collstart, collend)
+
+    return result.build()
+
+def unicode_encode_latin_1(p, size, errors, errorhandler=None):
+    res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 256)
+    return res
+
+def unicode_encode_ascii(p, size, errors, errorhandler=None):
+    res = unicode_encode_ucs1_helper(p, size, errors, errorhandler, 128)
+    return res
+
+# }}}
+
+# ____________________________________________________________
+# utf-8 {{{
+
+# Converting bytes (utf8) to unicode?
+# I guess we just make sure we're looking at valid utf-8 and then make the
+# object?
+
+def unicode_encode_utf_8(s, size, errors, errorhandler=None,
+                         allow_surrogates=False):
+    if size < len(s):
+        return s.bytes[0:s.index_of_char(size)]
+    return s.bytes
+
+def str_decode_utf_8(s, size, errors, final=False,
+                     errorhandler=None, allow_surrogates=False):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+    result = Utf8Builder(size)
+    pos = str_decode_utf_8_impl(s, size, errors, final, errorhandler, result,
+                                allow_surrogates=allow_surrogates)
+    return result.build(), pos
+
+def str_decode_utf_8_impl(s, size, errors, final, errorhandler, result,
+                      allow_surrogates):
+    if size == 0:
+        return 0
+
+    # TODO: Instead of assembling and then re-disassembling the codepoints,
+    #       just use builder.append_slice
+    pos = 0
+    while pos < size:
+        ordch1 = ord(s[pos])
+        # fast path for ASCII
+        # XXX maybe use a while loop here
+        if ordch1 < 0x80:
+            result.append(ordch1)
+            pos += 1
+            continue
+
+        n = utf8_code_length[ordch1]
+        if pos + n > size:
+            if not final:
+                break
+            charsleft = size - pos - 1 # either 0, 1, 2
+            # note: when we get the 'unexpected end of data' we don't care
+            # about the pos anymore and we just ignore the value
+            if not charsleft:
+                # there's only the start byte and nothing else
+                r, pos = errorhandler(errors, 'utf8',
+                                      'unexpected end of data',
+                                      s, pos, pos+1)
+                result.append(r)
+                break
+            ordch2 = ord(s[pos+1])
+            if n == 3:
+                # 3-bytes seq with only a continuation byte
+                if (ordch2>>6 != 0x2 or   # 0b10
+                    (ordch1 == 0xe0 and ordch2 < 0xa0)):
+                    # or (ordch1 == 0xed and ordch2 > 0x9f)
+                    # second byte invalid, take the first and continue
+                    r, pos = errorhandler(errors, 'utf8',
+                                          'invalid continuation byte',
+                                          s, pos, pos+1)
+                    result.append(r)
+                    continue
+                else:
+                    # second byte valid, but third byte missing
+                    r, pos = errorhandler(errors, 'utf8',
+                                      'unexpected end of data',
+                                      s, pos, pos+2)
+                    result.append(r)
+                    break
+            elif n == 4:
+                # 4-bytes seq with 1 or 2 continuation bytes
+                if (ordch2>>6 != 0x2 or    # 0b10
+                    (ordch1 == 0xf0 and ordch2 < 0x90) or
+                    (ordch1 == 0xf4 and ordch2 > 0x8f)):
+                    # second byte invalid, take the first and continue
+                    r, pos = errorhandler(errors, 'utf8',
+                                          'invalid continuation byte',
+                                          s, pos, pos+1)
+                    result.append(r)
+                    continue
+                elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2:   # 0b10
+                    # third byte invalid, take the first two and continue
+                    r, pos = errorhandler(errors, 'utf8',
+                                          'invalid continuation byte',
+                                          s, pos, pos+2)
+                    result.append(r)
+                    continue
+                else:
+                    # there's only 1 or 2 valid cb, but the others are missing
+                    r, pos = errorhandler(errors, 'utf8',
+                                      'unexpected end of data',
+                                      s, pos, pos+charsleft+1)
+                    result.append(r)
+                    break
+
+        if n == 0:
+            r, pos = errorhandler(errors, 'utf8',
+                                  'invalid start byte',
+                                  s, pos, pos+1)
+            result.append(r)
+
+        elif n == 1:
+            assert 0, "ascii should have gone through the fast path"
+
+        elif n == 2:
+            ordch2 = ord(s[pos+1])
+            if ordch2>>6 != 0x2:   # 0b10
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+1)
+                result.append(r)
+                continue
+            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+            result.append(((ordch1 & 0x1F) << 6) +    # 0b00011111
+                           (ordch2 & 0x3F))           # 0b00111111
+            pos += 2
+
+        elif n == 3:
+            ordch2 = ord(s[pos+1])
+            ordch3 = ord(s[pos+2])
+            if (ordch2>>6 != 0x2 or    # 0b10
+                (ordch1 == 0xe0 and ordch2 < 0xa0)
+                # surrogates shouldn't be valid UTF-8!
+                or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
+                ):
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+1)
+                result.append(r)
+                continue
+            elif ordch3>>6 != 0x2:     # 0b10
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+2)
+                result.append(r)
+                continue
+            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+            result.append((((ordch1 & 0x0F) << 12) +     # 0b00001111
+                           ((ordch2 & 0x3F) << 6) +      # 0b00111111
+                            (ordch3 & 0x3F)))            # 0b00111111
+            pos += 3
+
+        elif n == 4:
+            ordch2 = ord(s[pos+1])
+            ordch3 = ord(s[pos+2])
+            ordch4 = ord(s[pos+3])
+            if (ordch2>>6 != 0x2 or     # 0b10
+                (ordch1 == 0xf0 and ordch2 < 0x90) or
+                (ordch1 == 0xf4 and ordch2 > 0x8f)):
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+1)
+                result.append(r)
+                continue
+            elif ordch3>>6 != 0x2:     # 0b10
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+2)
+                result.append(r)
+                continue
+            elif ordch4>>6 != 0x2:     # 0b10
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+3)
+                result.append(r)
+                continue
+            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+            c = (((ordch1 & 0x07) << 18) +      # 0b00000111
+                 ((ordch2 & 0x3F) << 12) +      # 0b00111111
+                 ((ordch3 & 0x3F) << 6) +       # 0b00111111
+                 (ordch4 & 0x3F))               # 0b00111111
+
+            # TODO: Why doesn't this raise an error when c > MAXUNICODE? If I'm
+            #       converting utf8 -> utf8, is this necessary?
+            if c <= MAXUNICODE:
+                result.append(c)
+            pos += 4
+
+    return pos
+
+# }}}
+
+# ____________________________________________________________
+# utf-16 {{{
+
+def str_decode_utf_16(s, size, errors, final=True,
+                      errorhandler=None):
+    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+                                                         errorhandler, "native")
+    return result, length
+
+def str_decode_utf_16_be(s, size, errors, final=True,
+                         errorhandler=None):
+    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+                                                         errorhandler, "big")
+    return result, length


More information about the pypy-commit mailing list