[pypy-commit] pypy utf8-unicode2: WIP. Most codec and unicodeobject tests pass now

waedt noreply at buildbot.pypy.org
Mon Jun 30 17:32:09 CEST 2014


Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72291:927fb84a5116
Date: 2014-06-30 10:14 -0500
http://bitbucket.org/pypy/pypy/changeset/927fb84a5116/

Log:	WIP. Most codec and unicodeobject tests pass now

diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -13,6 +13,7 @@
 from pypy.interpreter.astcompiler.consts import (
     CO_OPTIMIZED, CO_NEWLOCALS, CO_VARARGS, CO_VARKEYWORDS, CO_NESTED,
     CO_GENERATOR, CO_KILL_DOCSTRING, CO_YIELD_INSIDE_TRY)
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.tool.stdlib_opcode import opcodedesc, HAVE_ARGUMENT
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib.objectmodel import compute_hash
@@ -150,6 +151,8 @@
         for const in code.co_consts:
             if isinstance(const, types.CodeType): # from stable compiler
                 const = code_hook(space, const, hidden_applevel, code_hook)
+            if isinstance(const, unicode):
+                const = Utf8Str.from_unicode(const)
             newconsts_w[num] = space.wrap(const)
             num += 1
         # stick the underlying CPython magic value, if the code object
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -1,3 +1,7 @@
+# -*- coding: utf-8 -*-
+
+import py
+import sys
 from pypy.interpreter.utf8 import (
     Utf8Str, Utf8Builder, utf8chr, utf8ord)
 
@@ -20,6 +24,39 @@
                                 0xF0, 0x9F, 0x98, 0xBD,
                             ]]
 
+def test_iterator():
+    s = build_utf8str()
+    iter = s.codepoint_iter()
+    assert iter.peek_next() == 0x41
+    assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D]
+
+    for i in range(1, 5):
+        iter = s.codepoint_iter()
+        iter.move(i)
+        if i != 4:
+            assert iter.peek_next() == [0x41, 0x10F, 0x20AC, 0x1F63D][i]
+        assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][i:]
+
+    for i in range(1, 5):
+        iter = s.codepoint_iter()
+        list(iter) # move the iterator to the end
+        iter.move(-i)
+        assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][4-i:]
+
+    iter = s.char_iter()
+    l = [s.bytes.decode('utf8') for s in list(iter)]
+    if sys.maxunicode < 65536:
+        assert l[:3] == [u'A', u'\u010F', u'\u20AC']
+    else:
+        assert l == [u'A', u'\u010F', u'\u20AC', u'\U00001F63D']
+
+def test_builder_append_slice():
+    builder = Utf8Builder()
+    builder.append_slice(Utf8Str.from_unicode(u"0ê0"), 1, 2)
+    builder.append_slice("Test", 1, 3)
+
+    assert builder.build() == u"êes"
+
 def test_unicode_literal_comparison():
     builder = Utf8Builder()
     builder.append(0x10F)
@@ -55,9 +92,65 @@
     assert s[-1] == utf8chr(0x1F63D)
     assert s[-2] == utf8chr(0x20AC)
 
+    with py.test.raises(IndexError):
+        c = s[4]
+
 def test_getslice():
     s = build_utf8str()
 
     assert s[0:1] == u'A'
     assert s[0:2] == u'A\u010F'
     assert s[1:2] == u'\u010F'
+
+def test_convert_indices():
+    s = build_utf8str()
+
+    assert s.index_of_char(0) == 0
+    assert s.index_of_char(1) == 1
+    assert s.index_of_char(2) == 3
+    assert s.index_of_char(3) == 6
+
+    for i in range(len(s)):
+        assert s.char_index_of_byte(s.index_of_char(i)) == i
+
+def test_join():
+    s = Utf8Str(' ')
+    assert s.join([]) == u''
+
+    
+    assert s.join([Utf8Str('one')]) == u'one'
+    assert s.join([Utf8Str('one'), Utf8Str('two')]) == u'one two'
+
+def test_find():
+    u = u"äëïöü"
+    s = Utf8Str.from_unicode(u)
+
+    for c in u:
+        assert s.find(Utf8Str.from_unicode(u)) == u.find(u)
+        assert s.rfind(Utf8Str.from_unicode(u)) == u.rfind(u)
+
+    assert s.find('') == u.find('')
+    assert s.rfind('') == u.rfind('')
+
+    assert s.find('1') == u.find('1')
+    assert s.rfind('1') == u.rfind('1')
+
+    assert Utf8Str.from_unicode(u'abcdefghiabc').rfind(u'') == 12
+
+def test_count():
+    u = u"12äëïöü223"
+    s = Utf8Str.from_unicode(u)
+
+    assert s.count("1") == u.count("1")
+    assert s.count("2") == u.count("2")
+    assert s.count(Utf8Str.from_unicode(u"ä")) == u.count(u"ä")
+
+def test_split():
+    # U+00A0 is a non-breaking space
+    u = u"one two three\xA0four"
+    s = Utf8Str.from_unicode(u)
+
+    assert s.split() == u.split()
+    assert s.split(' ') == u.split(' ')
+    assert s.split(maxsplit=1) == u.split(None, 1)
+    assert s.split('\n') == [s]
diff --git a/pypy/interpreter/test/test_utf8_codecs.py b/pypy/interpreter/test/test_utf8_codecs.py
--- a/pypy/interpreter/test/test_utf8_codecs.py
+++ b/pypy/interpreter/test/test_utf8_codecs.py
@@ -6,19 +6,6 @@
 from pypy.interpreter.utf8 import Utf8Str
 from pypy.interpreter import utf8_codecs
 
-'''
-try:
-    import signal
-except ImportError:
-    pass
-else:
-    class MyKeyboardInterrupt(BaseException):
-        pass
-    def _interrupt(*args):
-        __tracebackhide__ = True
-        raise MyKeyboardInterrupt
-    signal.signal(signal.SIGINT, _interrupt)
-'''
 
 class UnicodeTests(object):
     def typeequals(self, x, y):
@@ -697,19 +684,13 @@
         for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
             self.checkencode(s, "utf-8")
 
-    # TODO: Is this test useful?
     def test_utf8_surrogates(self):
         # make sure that the string itself is not marshalled
         u = u"\ud800"
         for i in range(4):
             u += u"\udc00"
 
-        if utf8_codecs.MAXUNICODE < 65536:
-            # Check replacing of two surrogates by single char while encoding
-            self.checkencode(u, "utf-8")
-        else:
-            # This is not done in wide unicode builds
-            py.test.raises(UnicodeEncodeError, self.checkencode, u, "utf-8")
+        py.test.raises(UnicodeEncodeError, self.checkencode, u, "utf-8")
 
     def test_ascii_error(self):
         self.checkencodeerror(
@@ -780,13 +761,13 @@
         u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
         if runicode.MAXUNICODE < 65536:
             # Narrow unicode build, consider utf16 surrogate pairs
-            assert runicode.unicode_encode_unicode_escape(
+            assert utf8_codecs.unicode_encode_unicode_escape(
                 u, len(u), True) == r'\U00010000'
-            assert runicode.unicode_encode_raw_unicode_escape(
+            assert utf8_codecs.unicode_encode_raw_unicode_escape(
                 u, len(u), True) == r'\U00010000'
         else:
             # Wide unicode build, don't merge utf16 surrogate pairs
-            assert runicode.unicode_encode_unicode_escape(
+            assert utf8_codecs.unicode_encode_unicode_escape(
                 u, len(u), True) == r'\ud800\udc00'
-            assert runicode.unicode_encode_raw_unicode_escape(
+            assert utf8_codecs.unicode_encode_raw_unicode_escape(
                 u, len(u), True) == r'\ud800\udc00'
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -1,6 +1,7 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.runicode import utf8_code_length
+from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
 from rpython.rlib.rarithmetic import r_uint
 
 def utf8chr(value):
@@ -9,9 +10,7 @@
     b.append(value)
     return b.build()
 
-def utf8ord(ustr, start=0):
-    bytes = ustr.bytes
-    start = ustr.index_of_char(start)
+def utf8ord_bytes(bytes, start):
     codepoint_length = utf8_code_length[ord(bytes[start])]
 
     if codepoint_length == 1:
@@ -31,6 +30,16 @@
                 (ord(bytes[start + 2]) & 0x3F) << 6 |
                 (ord(bytes[start + 3]) & 0x3F))
 
+def utf8ord(ustr, start=0):
+    start = ustr.index_of_char(start)
+    return utf8ord_bytes(ustr.bytes, start)
+
+ at specialize.argtype(0)
+def ORD(s, pos):
+    if isinstance(s, Utf8Str):
+        return utf8ord(s, pos)
+    else:
+        return ord(s[pos])
 
 class Utf8Str(object):
     _immutable_fields_ = ['bytes', '_is_ascii', '_len']
@@ -72,14 +81,27 @@
 
         return byte
 
+    def char_index_of_byte(self, byte_):
+        byte = 0
+        pos = 0
+        while byte < byte_:
+            pos += 1
+            byte += utf8_code_length[ord(self.bytes[byte])]
+
+        return pos
+
     def __getitem__(self, char_pos):
         # This if statement is needed for [-1:0] to slice correctly
+        if char_pos >= self._len:
+            raise IndexError()
         if char_pos < 0:
             char_pos += self._len
         return self[char_pos:char_pos+1]
 
     def __getslice__(self, start, stop):
-        assert start < stop
+        assert start <= stop
+        if start == stop:
+            return Utf8Str('')
         # TODO: If start > _len or stop >= _len, then raise exception 
 
         if self._is_ascii:
@@ -102,6 +124,13 @@
         return Utf8Str(self.bytes[start_byte:stop_byte], is_ascii,
                        stop - start)
 
+    def __add__(self, other):
+        return Utf8Str(self.bytes + other.bytes,
+                       self._is_ascii and other._is_ascii)
+
+    def __mul__(self, count):
+        return Utf8Str(self.bytes * count, self._is_ascii)
+
     def __len__(self):
         return self._len
 
@@ -127,32 +156,162 @@
         raise TypeError()
 
     def __iter__(self):
-        byte_pos = 0
-        while byte_pos < len(self.bytes):
-            cplen = utf8_code_length[ord(self.bytes[byte_pos])]
-            yield Utf8Str(self.bytes[byte_pos:byte_pos+cplen])
-            byte_pos += cplen
+        return self.char_iter()
 
-    @specialize.argtype(1)
-    def find(self, other):
+    def char_iter(self):
+        return Utf8StrCharIterator(self)
+
+    def codepoint_iter(self):
+        return Utf8StrCodePointIterator(self)
+
+    @specialize.argtype(1, 2)
+    def _bound_check(self, start, end):
+        if start is None:
+            start = 0
+        elif start < 0:
+            start += len(self)
+            if start < 0:
+                start = 0
+            else:
+                start = self.index_of_char(start)
+        elif start > len(self):
+            start = -1
+        else:
+            start = self.index_of_char(start)
+
+        if end is None or end >= len(self):
+            end = len(self.bytes)
+        elif end < 0:
+            end += len(self)
+            if end < 0:
+                end = 0
+            else:
+                end = self.index_of_char(end)
+        elif end > len(self):
+            end = len(self.bytes)
+        else:
+            end = self.index_of_char(end)
+
+        return start, end
+
+    @specialize.argtype(2, 3)
+    def find(self, other, start=None, end=None):
+        start, end = self._bound_check(start, end)
+        if start == -1:
+            return -1
+
         if isinstance(other, Utf8Str):
-            return self.bytes.find(other.bytes)
-        if isinstance(other, unicode):
-            return unicode(self.bytes, 'utf8').find(other)
-        if isinstance(other, str):
-            return self.bytes.find(other)
+            pos = self.bytes.find(other.bytes, start, end)
+        elif isinstance(other, unicode):
+            pos = unicode(self.bytes, 'utf8').find(other, start, end)
+        elif isinstance(other, str):
+            pos = self.bytes.find(other, start, end)
 
-    def rfind(self, other):
+        if pos == -1:
+            return -1
+
+        return self.char_index_of_byte(pos)
+
+    @specialize.argtype(2, 3)
+    def rfind(self, other, start=None, end=None):
+        start, end = self._bound_check(start, end)
+        if start == -1:
+            return -1
+
         if isinstance(other, Utf8Str):
-            return self.bytes.rfind(other.bytes)
-        if isinstance(other, unicode):
-            return unicode(self.bytes, 'utf8').rfind(other)
-        if isinstance(other, str):
-            return self.bytes.rfind(other)
+            pos = self.bytes.rfind(other.bytes, start, end)
+        elif isinstance(other, unicode):
+            return unicode(self.bytes, 'utf8').rfind(other, start, end)
+        elif isinstance(other, str):
+            pos = self.bytes.rfind(other, start, end)
+
+        if pos == -1:
+            return -1
+
+        return self.char_index_of_byte(pos)
+
+    @specialize.argtype(2, 3)
+    def count(self, other, start=None, end=None):
+        start, end = self._bound_check(start, end)
+        if start == -1:
+            return 0
+
+        if isinstance(other, Utf8Str):
+            count = self.bytes.count(other.bytes, start, end)
+        elif isinstance(other, unicode):
+            return unicode(self.bytes, 'utf8').count(other, start, end)
+        elif isinstance(other, str):
+            count = self.bytes.count(other, start, end)
+
+        if count == -1:
+            return -1
+
+        return count
 
     def endswith(self, other):
         return self.rfind(other) == len(self) - len(other)
 
+    @specialize.argtype(1)
+    def split(self, other=None, maxsplit=-1):
+        if other is not None:
+            if isinstance(other, str):
+                other_bytes = other
+            if isinstance(other, Utf8Str):
+                other_bytes = other.bytes
+            return [Utf8Str(s) for s in self.bytes.split(other_bytes, maxsplit)]
+
+        res = []
+        iter = self.codepoint_iter()
+        while True:
+            # skip whitespace up to the start of the next word
+            for cd in iter:
+                if not unicodedb.isspace(cd):
+                    break
+            else:
+                break
+
+            iter.prev_count(1)
+            start_byte = iter.byte_pos
+            iter.next_count(1)
+
+            if maxsplit == 0:
+                res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+                break
+
+            for cd in iter:
+                if unicodedb.isspace(cd):
+                    break
+            else:
+                # Hit the end of the string
+                res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+                break
+
+            iter.prev_count(1)
+            res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos]))
+            iter.next_count(1)
+            maxsplit -= 1
+
+        return res
+
+    @specialize.argtype(1)
+    def rsplit(self, other=None, maxsplit=-1):
+        if other is not None:
+            if isinstance(other, str):
+                other_bytes = other
+            if isinstance(other, Utf8Str):
+                other_bytes = other.bytes
+            return [Utf8Str(s) for s in self.bytes.rsplit(other_bytes, maxsplit)]
+
+        # TODO: splitting on whitespace (other is None) is not implemented yet and falls through, returning None; it needs a reverse_codepoint_iter first
+
+    def join(self, other):
+        if len(other) == 0:
+            return Utf8Str('')
+
+        assert isinstance(other[0], Utf8Str)
+        return Utf8Str(self.bytes.join([s.bytes for s in other]),
+                       self._is_ascii and all(s._is_ascii for s in other))
+
     def as_unicode(self):
         """NOT_RPYTHON"""
         return self.bytes.decode('utf-8')
@@ -162,6 +321,84 @@
         """NOT_RPYTHON"""
         return Utf8Str(u.encode('utf-8'))
 
+class Utf8StrCodePointIterator(object):
+    def __init__(self, ustr):
+        self.ustr = ustr
+        self.pos = 0
+        self.byte_pos = 0
+
+        if len(ustr) != 0:
+            self.current = utf8ord_bytes(ustr.bytes, 0)
+        else:
+            self.current = -1
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        if self.pos == len(self.ustr):
+            raise StopIteration()
+        self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+
+        self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+        self.pos += 1
+
+        return self.current
+
+    def next_count(self, count=1):
+        self.pos += count
+        while count > 1:
+            self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+            count -= 1
+        self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+        self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+
+    def prev_count(self, count=1):
+        self.pos -= count
+        while count > 0:
+            self.byte_pos -= 1
+            while utf8_code_length[ord(self.ustr.bytes[self.byte_pos])] == 0:
+                self.byte_pos -= 1
+            count -= 1
+
+        self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+
+    def move(self, count):
+        if count > 0:
+            self.next_count(count)
+        elif count < 0:
+            self.prev_count(-count)
+
+    def peek_next(self):
+        return utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+
+class Utf8StrCharIterator(object):
+    def __init__(self, ustr):
+        self.ustr = ustr
+        self.byte_pos = 0
+        self.current = self._get_current()
+
+    def __iter__(self):
+        return self
+
+    def _get_current(self):
+        if self.byte_pos == len(self.ustr.bytes):
+            return None
+        length = utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+        return Utf8Str(''.join([self.ustr.bytes[i]
+                        for i in range(self.byte_pos, self.byte_pos + length)]),
+                       length == 1)
+
+    def next(self):
+        #import pdb; pdb.set_trace()
+        ret = self.current
+        if ret is None:
+            raise StopIteration()
+
+        self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+        self.current = self._get_current()
+        return ret
+
 class Utf8Builder(object):
     @specialize.argtype(1)
     def __init__(self, init_size=None):
@@ -204,10 +441,19 @@
                 assert ord(c) < 128
             self._builder.append(c)
 
-    def append_slice(self, s, start, end, is_ascii=False):
-        self._builder.append_slice(s, start, end)
-        if not is_ascii:
-            self._is_ascii = False
+    @specialize.argtype(1)
+    def append_slice(self, s, start, end):
+        if isinstance(s, str):
+            self._builder.append_slice(s, start, end)
+        elif isinstance(s, Utf8Str):
+            self._builder.append_slice(s.bytes, s.index_of_char(start),
+                                       s.index_of_char(end))
+        else:
+            raise TypeError("Invalid type '%s' for Utf8Str.append_slice" %
+                            type(s))
+
+    def append_multiple_char(self, c, count):
+        self._builder.append_multiple_char(c, count)
 
     def build(self):
         return Utf8Str(self._builder.build(), self._is_ascii)
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -6,7 +6,7 @@
 from rpython.rlib.unicodedata import unicodedb
 from rpython.rlib.runicode import utf8_code_length
 
-from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord, ORD
 
 
 BYTEORDER = sys.byteorder
@@ -33,7 +33,7 @@
 
         # Non-escape characters are interpreted as Unicode ordinals
         if ch != '\\':
-            builder.append(ch)
+            builder.append(ord(ch))
             pos += 1
             continue
 
@@ -383,6 +383,8 @@
 @specialize.arg_or_var(3)
 def unicode_encode_ucs1_helper(p, size, errors,
                                errorhandler=None, limit=256):
+    if len(p) == 0:
+        return ''
     if errorhandler is None:
         errorhandler = default_unicode_error_encode
     if limit == 256:
@@ -415,8 +417,9 @@
                 result.append(rs)
                 continue
             for ch in ru:
-                if ord(ch) < limit:
-                    result.append(chr(ord(ch)))
+                cd = ORD(ch, 0)
+                if cd < limit:
+                    result.append(chr(cd))
                 else:
                     errorhandler("strict", encoding, reason, p,
                                  collstart, collend)
@@ -436,15 +439,60 @@
 # ____________________________________________________________
 # utf-8 {{{
 
-# Converting bytes (utf8) to unicode?
-# I guess we just make sure we're looking at valid utf-8 and then make the
-# object?
 
 def unicode_encode_utf_8(s, size, errors, errorhandler=None,
                          allow_surrogates=False):
-    if size < len(s):
-        return s.bytes[0:s.index_of_char(size)]
-    return s.bytes
+    if len(s) == 0:
+        return ''
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+
+    return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
+                                     allow_surrogates)
+
+def unicode_encode_utf_8_impl(s, size, errors, errorhandler, allow_surrogates):
+    iter = s.codepoint_iter()
+    for oc in iter:
+        if oc >= 0xD800 and oc <= 0xDFFF:
+            break
+        if iter.pos == size:
+            return s.bytes
+    else:
+        return s.bytes
+
+    iter.move(-1)
+    result = Utf8Builder(len(s.bytes))
+    result.append_slice(s.bytes, 0, iter.byte_pos)
+
+    for oc in iter:
+        if oc >= 0xD800 and oc <= 0xDFFF:
+            # Check the next character to see if this is a surrogate pair
+            if (iter.pos != len(s) and oc <= 0xDBFF and
+                0xDC00 <= iter.peek_next() <= 0xDFFF):
+                oc2 = iter.next()
+                result.append(((oc - 0xD800) << 10 | (oc2 - 0xDC00)) + 0x10000)
+            elif allow_surrogates:
+                result.append(oc)
+            else:
+                ru, rs, pos = errorhandler(errors, 'utf8',
+                                        'surrogates not allowed', s,
+                                        iter.pos-1, iter.pos)
+                iter.move(pos - iter.pos)
+                if rs is not None:
+                    # py3k only
+                    result.append(rs)
+                    continue
+                for ch in ru:
+                    if ord(ch) < 0x80:
+                        result.append(ch)
+                    else:
+                        errorhandler('strict', 'utf8',
+                                    'surrogates not allowed',
+                                    s, pos-1, pos)
+        else:
+            result.append(oc)
+
+    return result.build().bytes
 
 def str_decode_utf_8(s, size, errors, final=False,
                      errorhandler=None, allow_surrogates=False):
@@ -1219,7 +1267,7 @@
 # ____________________________________________________________
 # Charmap {{{
 
-ERROR_CHAR = u'\ufffe'
+ERROR_CHAR = Utf8Str.from_unicode(u'\ufffe')
 
 @specialize.argtype(5)
 def str_decode_charmap(s, size, errors, final=False,
@@ -1296,84 +1344,16 @@
 
 def str_decode_unicode_internal(s, size, errors, final=False,
                                 errorhandler=None):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_decode
-    if size == 0:
-        return u'', 0
-
-    if MAXUNICODE < 65536:
-        unicode_bytes = 2
+    if BYTEORDER == 'little':
+        return str_decode_utf_32_le(s, size, errors, errorhandler)
     else:
-        unicode_bytes = 4
-    if BYTEORDER == "little":
-        start = 0
-        stop = unicode_bytes
-        step = 1
-    else:
-        start = unicode_bytes - 1
-        stop = -1
-        step = -1
-
-    result = UnicodeBuilder(size // unicode_bytes)
-    pos = 0
-    while pos < size:
-        if pos > size - unicode_bytes:
-            res, pos = errorhandler(errors, "unicode_internal",
-                                    "truncated input",
-                                    s, pos, size)
-            result.append(res)
-            if pos > size - unicode_bytes:
-                break
-            continue
-        t = r_uint(0)
-        h = 0
-        for j in range(start, stop, step):
-            t += r_uint(ord(s[pos + j])) << (h*8)
-            h += 1
-        if t > MAXUNICODE:
-            res, pos = errorhandler(errors, "unicode_internal",
-                                    "unichr(%d) not in range" % (t,),
-                                    s, pos, pos + unicode_bytes)
-            result.append(res)
-            continue
-        result.append(UNICHR(t))
-        pos += unicode_bytes
-    return result.build(), pos
+        return str_decode_utf_32_be(s, size, errors, errorhandler)
 
 def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
-    if size == 0:
-        return ''
-
-    if MAXUNICODE < 65536:
-        unicode_bytes = 2
+    if BYTEORDER == 'little':
+        return unicode_encode_utf_32_le(s, size, errors, errorhandler)
     else:
-        unicode_bytes = 4
-
-    result = StringBuilder(size * unicode_bytes)
-    pos = 0
-    while pos < size:
-        oc = utf8ord(s, pos)
-        if MAXUNICODE < 65536:
-            if BYTEORDER == "little":
-                result.append(chr(oc       & 0xFF))
-                result.append(chr(oc >>  8 & 0xFF))
-            else:
-                result.append(chr(oc >>  8 & 0xFF))
-                result.append(chr(oc       & 0xFF))
-        else:
-            if BYTEORDER == "little":
-                result.append(chr(oc       & 0xFF))
-                result.append(chr(oc >>  8 & 0xFF))
-                result.append(chr(oc >> 16 & 0xFF))
-                result.append(chr(oc >> 24 & 0xFF))
-            else:
-                result.append(chr(oc >> 24 & 0xFF))
-                result.append(chr(oc >> 16 & 0xFF))
-                result.append(chr(oc >>  8 & 0xFF))
-                result.append(chr(oc       & 0xFF))
-        pos += 1
-
-    return result.build()
+        return unicode_encode_utf_32_be(s, size, errors, errorhandler)
 
 # }}}
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -3,6 +3,7 @@
 from rpython.rlib.rstring import UnicodeBuilder
 from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
 
+from pypy.interpreter.utf8 import Utf8Builder, Utf8Str, utf8chr, utf8ord
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 
@@ -206,13 +207,13 @@
     w_end = space.getattr(w_exc, space.wrap('end'))
     size = space.int_w(w_end) - space.int_w(w_start)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        text = u'?' * size
+        text = Utf8Str('?' * size, True)
         return space.newtuple([space.wrap(text), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
-        text = u'\ufffd'
+        text = utf8chr(0xfffd)
         return space.newtuple([space.wrap(text), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
-        text = u'\ufffd' * size
+        text = utf8chr(0xfffd) * size
         return space.newtuple([space.wrap(text), w_end])
     else:
         raise oefmt(space.w_TypeError,
@@ -251,25 +252,26 @@
         start = space.int_w(space.getattr(w_exc, space.wrap('start')))
         w_end = space.getattr(w_exc, space.wrap('end'))
         end = space.int_w(w_end)
-        builder = UnicodeBuilder()
+
+        builder = Utf8Builder()
         pos = start
         while pos < end:
-            oc = ord(obj[pos])
+            oc = utf8ord(obj, pos)
             num = hex(oc)
             if (oc >= 0x10000):
-                builder.append(u"\\U")
+                builder.append("\\U")
                 zeros = 8
             elif (oc >= 0x100):
-                builder.append(u"\\u")
+                builder.append("\\u")
                 zeros = 4
             else:
-                builder.append(u"\\x")
+                builder.append("\\x")
                 zeros = 2
             lnum = len(num)
             nb = zeros + 2 - lnum # num starts with '0x'
             if nb > 0:
                 builder.append_multiple_char(u'0', nb)
-            builder.append_slice(unicode(num), 2, lnum)
+            builder.append_slice(num, 2, lnum)
             pos += 1
         return space.newtuple([space.wrap(builder.build()), w_end])
     else:
@@ -378,7 +380,6 @@
 # ____________________________________________________________
 # delegation to runicode
 
-#from rpython.rlib import runicode
 from pypy.interpreter import utf8_codecs
 
 def make_encoder_wrapper(name):
@@ -548,7 +549,7 @@
             if not 0 <= x <= 0x10FFFF:
                 raise oefmt(space.w_TypeError,
                     "character mapping must be in range(0x110000)")
-            return code_to_unichr(x)
+            return utf8chr(x)
         elif space.is_w(w_ch, space.w_None):
             # Charmap may return None
             return errorchar
@@ -566,7 +567,7 @@
 
         # get the character from the mapping
         try:
-            w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+            w_ch = space.getitem(self.w_mapping, space.newint(utf8ord(ch)))
         except OperationError, e:
             if not e.match(space, space.w_LookupError):
                 raise
@@ -595,7 +596,7 @@
     if errors is None:
         errors = 'strict'
     if len(string) == 0:
-        return space.newtuple([space.wrap(u''), space.wrap(0)])
+        return space.newtuple([space.wrap(Utf8Str('')), space.wrap(0)])
 
     if space.is_none(w_mapping):
         mapping = None
@@ -631,7 +632,7 @@
     w_charmap = space.newdict()
     for num in range(len(chars)):
         elem = chars[num]
-        space.setitem(w_charmap, space.newint(ord(elem)), space.newint(num))
+        space.setitem(w_charmap, space.newint(utf8ord(elem)), space.newint(num))
     return w_charmap
 
 # ____________________________________________________________
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -10,6 +10,7 @@
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import (
     WrappedDefault, interp2app, interpindirect2app, unwrap_spec)
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.objspace.std import newformat
 from pypy.objspace.std.basestringtype import basestring_typedef
 from pypy.objspace.std.formatting import mod_format
@@ -715,11 +716,11 @@
             sub = self_as_uni._op_val(space, w_old)
             by = self_as_uni._op_val(space, w_new)
             try:
-                res = replace(input, sub, by, count)
+                res = replace(input.bytes, sub.bytes, by.bytes, count)
             except OverflowError:
                 raise oefmt(space.w_OverflowError,
                             "replace string is too long")
-            return self_as_uni._new(res)
+            return self_as_uni._new(Utf8Str(res))
         return self._StringMethods_descr_replace(space, w_old, w_new, count)
 
     _StringMethods_descr_join = descr_join
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -2,13 +2,14 @@
 String formatting routines.
 """
 import sys
-from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib import jit
 from rpython.rlib.rfloat import formatd, DTSF_ALT, isnan, isinf
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rlib.rarithmetic import INT_MAX
 from rpython.tool.sourcetools import func_with_new_name
+from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import Utf8Builder, ORD
 
 
 class BaseStringFormatter(object):
@@ -168,7 +169,7 @@
         def peekchr(self):
             # return the 'current' character
             try:
-                return self.fmt[self.fmtpos]
+                return ORD(self.fmt, self.fmtpos)
             except IndexError:
                 space = self.space
                 raise OperationError(space.w_ValueError,
@@ -185,16 +186,16 @@
             pcount = 1
             while 1:
                 try:
-                    c = fmt[i]
+                    c = ORD(fmt, i)
                 except IndexError:
                     space = self.space
                     raise OperationError(space.w_ValueError,
                                          space.wrap("incomplete format key"))
-                if c == ')':
+                if c == ord(')'):
                     pcount -= 1
                     if pcount == 0:
                         break
-                elif c == '(':
+                elif c == ord('('):
                     pcount += 1
                 i += 1
             self.fmtpos = i + 1   # first character after ')'
@@ -210,7 +211,7 @@
             return space.getitem(self.w_valuedict, w_key)
 
         def parse_fmt(self):
-            if self.peekchr() == '(':
+            if self.peekchr() == ord('('):
                 w_value = self.getmappingvalue(self.getmappingkey())
             else:
                 w_value = None
@@ -223,7 +224,7 @@
                 self.f_ljust = True
                 self.width = -self.width
 
-            if self.peekchr() == '.':
+            if self.peekchr() == ord('.'):
                 self.forward()
                 self.prec = self.peel_num('prec', INT_MAX)
                 if self.prec < 0:
@@ -232,7 +233,7 @@
                 self.prec = -1
 
             c = self.peekchr()
-            if c == 'h' or c == 'l' or c == 'L':
+            if c == ord('h') or c == ord('l') or c == ord('L'):
                 self.forward()
 
             return w_value
@@ -247,15 +248,15 @@
             self.f_zero  = False
             while True:
                 c = self.peekchr()
-                if c == '-':
+                if c == ord('-'):
                     self.f_ljust = True
-                elif c == '+':
+                elif c == ord('+'):
                     self.f_sign = True
-                elif c == ' ':
+                elif c == ord(' '):
                     self.f_blank = True
-                elif c == '#':
+                elif c == ord('#'):
                     self.f_alt = True
-                elif c == '0':
+                elif c == ord('0'):
                     self.f_zero = True
                 else:
                     break
@@ -266,7 +267,7 @@
         def peel_num(self, name, maxval):
             space = self.space
             c = self.peekchr()
-            if c == '*':
+            if c == ord('*'):
                 self.forward()
                 w_value = self.nextinputvalue()
                 if name == 'width':
@@ -277,7 +278,7 @@
                     assert False
             result = 0
             while True:
-                digit = ord(c) - ord('0')
+                digit = c - ord('0')
                 if not (0 <= digit <= 9):
                     break
                 if result > (maxval - digit) / 10:
@@ -291,16 +292,17 @@
         def format(self):
             lgt = len(self.fmt) + 4 * len(self.values_w) + 10
             if do_unicode:
-                result = UnicodeBuilder(lgt)
+                result = Utf8Builder(lgt)
             else:
                 result = StringBuilder(lgt)
             self.result = result
+
             while True:
                 # fast path: consume as many characters as possible
                 fmt = self.fmt
                 i = i0 = self.fmtpos
                 while i < len(fmt):
-                    if fmt[i] == '%':
+                    if ORD(fmt, i) == ord('%'):
                         break
                     i += 1
                 else:
@@ -313,8 +315,8 @@
                 w_value = self.parse_fmt()
                 c = self.peekchr()
                 self.forward()
-                if c == '%':
-                    self.std_wp(const('%'))
+                if c == ord('%'):
+                    self.std_wp('%')
                     continue
                 if w_value is None:
                     w_value = self.nextinputvalue()
@@ -325,7 +327,7 @@
                     if c == c1:
                         # 'c1' is an annotation constant here,
                         # so this getattr() is ok
-                        do_fmt = getattr(self, 'fmt_' + c1)
+                        do_fmt = getattr(self, 'fmt_' + chr(c1))
                         do_fmt(w_value)
                         break
                 else:
@@ -348,7 +350,7 @@
             else:
                 s = c
             msg = "unsupported format character '%s' (0x%x) at index %d" % (
-                s, ord(c), self.fmtpos - 1)
+                s, ORD(c, 0), self.fmtpos - 1)
             raise OperationError(space.w_ValueError, space.wrap(msg))
 
         def std_wp(self, r):
@@ -359,7 +361,7 @@
             prec = self.prec
             if prec == -1 and self.width == 0:
                 # fast path
-                self.result.append(const(r))
+                self.result.append(r)
                 return
             if prec >= 0 and prec < length:
                 length = prec   # ignore the end of the string if too long
@@ -369,12 +371,12 @@
                 padding = 0
             assert padding >= 0
             if not self.f_ljust and padding > 0:
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
                 # add any padding at the left of 'r'
                 padding = 0
             result.append_slice(r, 0, length)       # add 'r' itself
             if padding > 0:
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
             # add any remaining padding at the right
         std_wp._annspecialcase_ = 'specialize:argtype(1)'
 
@@ -405,18 +407,19 @@
 
             assert padding >= 0
             if padnumber == '>':
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
                 # pad with spaces on the left
             if sign:
-                result.append(const(r[0]))        # the sign
-            result.append(const(prefix))               # the prefix
+                # TODO: Why r[0]?
+                result.append(r[0])        # the sign
+            result.append(prefix)               # the prefix
             if padnumber == '0':
-                result.append_multiple_char(const('0'), padding)
+                result.append_multiple_char('0', padding)
                 # pad with zeroes
-            result.append_slice(const(r), int(sign), len(r))
+            result.append_slice(r, int(sign), len(r))
             # the rest of the number
             if padnumber == '<':           # spaces on the right
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
 
         def string_formatting(self, w_value):
             space = self.space
@@ -499,7 +502,7 @@
 # an "unrolling" list of all the known format characters,
 # collected from which fmt_X() functions are defined in the class
 FORMATTER_CHARS = unrolling_iterable(
-    [_name[-1] for _name in dir(StringFormatter)
+    [ord(_name[-1]) for _name in dir(StringFormatter)
                if len(_name) == 5 and _name.startswith('fmt_')])
 
 def format(space, w_fmt, values_w, w_valuedict, do_unicode):
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -4,6 +4,7 @@
 import string
 
 from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD
 from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rfloat import copysign, formatd
@@ -47,7 +48,7 @@
         def __init__(self, space, is_unicode, template):
             self.space = space
             self.is_unicode = is_unicode
-            self.empty = u"" if is_unicode else ""
+            self.empty = Utf8Str("") if is_unicode else ""
             self.template = template
 
         def build(self, args):
@@ -59,7 +60,7 @@
         def _build_string(self, start, end, level):
             space = self.space
             if self.is_unicode:
-                out = rstring.UnicodeBuilder()
+                out = Utf8Builder()
             else:
                 out = rstring.StringBuilder()
             if not level:
@@ -74,23 +75,23 @@
             space = self.space
             last_literal = i = start
             while i < end:
-                c = s[i]
+                c = ORD(s, i)
                 i += 1
-                if c == "{" or c == "}":
+                if c == ord("{") or c == ord("}"):
                     at_end = i == end
                     # Find escaped "{" and "}"
                     markup_follows = True
-                    if c == "}":
-                        if at_end or s[i] != "}":
+                    if c == ord("}"):
+                        if at_end or ORD(s, i) != ord("}"):
                             raise OperationError(space.w_ValueError,
                                                  space.wrap("Single '}'"))
                         i += 1
                         markup_follows = False
-                    if c == "{":
+                    if c == ord("{"):
                         if at_end:
                             raise OperationError(space.w_ValueError,
                                                  space.wrap("Single '{'"))
-                        if s[i] == "{":
+                        if ORD(s, i) == ord("{"):
                             i += 1
                             markup_follows = False
                     # Attach literal data, ending with { or }
@@ -111,11 +112,11 @@
                     field_start = i
                     recursive = False
                     while i < end:
-                        c = s[i]
-                        if c == "{":
+                        c = ORD(s, i)
+                        if c == ord("{"):
                             recursive = True
                             nested += 1
-                        elif c == "}":
+                        elif c == ord("}"):
                             nested -= 1
                             if not nested:
                                 break
@@ -139,9 +140,9 @@
             i = start
             while i < end:
                 c = s[i]
-                if c == ":" or c == "!":
+                if c == ord(":") or c == ord("!"):
                     end_name = i
-                    if c == "!":
+                    if c == ord("!"):
                         i += 1
                         if i == end:
                             w_msg = self.space.wrap("expected conversion")
@@ -170,7 +171,7 @@
             end = len(name)
             while i < end:
                 c = name[i]
-                if c == "[" or c == ".":
+                if c == ord("[") or c == ord("."):
                     break
                 i += 1
             empty = not i
@@ -228,12 +229,12 @@
             i = start
             while i < end:
                 c = name[i]
-                if c == ".":
+                if c == ord("."):
                     i += 1
                     start = i
                     while i < end:
                         c = name[i]
-                        if c == "[" or c == ".":
+                        if c == ord("[") or c == ord("."):
                             break
                         i += 1
                     if start == i:
@@ -245,13 +246,13 @@
                     else:
                         self.parser_list_w.append(space.newtuple([
                             space.w_True, w_attr]))
-                elif c == "[":
+                elif c == ord("["):
                     got_bracket = False
                     i += 1
                     start = i
                     while i < end:
                         c = name[i]
-                        if c == "]":
+                        if c == ord("]"):
                             got_bracket = True
                             break
                         i += 1
@@ -280,8 +281,8 @@
             i = 0
             end = len(name)
             while i < end:
-                c = name[i]
-                if c == "[" or c == ".":
+                c = ORD(name, i)
+                if c == ord("[") or c == ord("."):
                     break
                 i += 1
             if i == 0:
@@ -303,10 +304,10 @@
 
         def _convert(self, w_obj, conversion):
             space = self.space
-            conv = conversion[0]
-            if conv == "r":
+            conv = ORD(conversion, 0)
+            if conv == ord("r"):
                 return space.repr(w_obj)
-            elif conv == "s":
+            elif conv == ord("s"):
                 if self.is_unicode:
                     return space.call_function(space.w_unicode, w_obj)
                 return space.str(w_obj)
@@ -416,15 +417,15 @@
             self.spec = spec
 
         def _is_alignment(self, c):
-            return (c == "<" or
-                    c == ">" or
-                    c == "=" or
-                    c == "^")
+            return (c == ord("<") or
+                    c == ord(">") or
+                    c == ord("=") or
+                    c == ord("^"))
 
         def _is_sign(self, c):
-            return (c == " " or
-                    c == "+" or
-                    c == "-")
+            return (c == ord(" ") or
+                    c == ord("+") or
+                    c == ord("-"))
 
         def _parse_spec(self, default_type, default_align):
             space = self.space
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -9,6 +9,7 @@
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import WrappedDefault, unwrap_spec
+from pypy.interpreter.utf8 import ORD
 from pypy.objspace.std import slicetype
 from pypy.objspace.std.sliceobject import W_SliceObject, normalize_simple_slice
 
@@ -141,7 +142,7 @@
         if d > 0:
             offset = d//2 + (d & width & 1)
             fillchar = self._multi_chr(fillchar[0])
-            centered = offset * fillchar + value + (d - offset) * fillchar
+            centered = fillchar * offset + value + fillchar * (d - offset)
         else:
             centered = value
 
@@ -192,9 +193,9 @@
             return self._empty()
 
         if self._use_rstr_ops(space, self):
-            splitted = value.split(self._chr('\t'))
+            splitted = value.split('\t')
         else:
-            splitted = split(value, self._chr('\t'))
+            splitted = split(value, '\t')
 
         try:
             ovfcheck(len(splitted) * tabsize)
@@ -203,7 +204,7 @@
         expanded = oldtoken = splitted.pop(0)
 
         for token in splitted:
-            expanded += self._multi_chr(self._chr(' ')) * self._tabindent(oldtoken,
+            expanded += self._multi_chr(' ') * self._tabindent(oldtoken,
                                                          tabsize) + token
             oldtoken = token
 
@@ -218,7 +219,8 @@
             offset = len(token)
 
             while 1:
-                if token[offset-1] == "\n" or token[offset-1] == "\r":
+                if (ORD(token, offset-1) == ord("\n") or
+                    ORD(token, offset-1) == ord("\r")):
                     break
                 distance += 1
                 offset -= 1
@@ -455,7 +457,7 @@
         d = width - len(value)
         if d > 0:
             fillchar = self._multi_chr(fillchar[0])
-            value += d * fillchar
+            value += fillchar * d
 
         return self._new(value)
 
@@ -469,7 +471,7 @@
         d = width - len(value)
         if d > 0:
             fillchar = self._multi_chr(fillchar[0])
-            value = d * fillchar + value
+            value = fillchar * d + value
 
         return self._new(value)
 
@@ -558,31 +560,39 @@
         res = []
         value = self._val(space)
         if space.is_none(w_sep):
-            res = split(value, maxsplit=maxsplit)
+            res = self._split(value, None, maxsplit)
             return self._newlist_unwrapped(space, res)
 
         by = self._op_val(space, w_sep)
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
-        res = split(value, by, maxsplit)
+        res = self._split(value, by, maxsplit)
 
         return self._newlist_unwrapped(space, res)
 
+    @staticmethod
+    def _split(value, sep=None, maxsplit=-1):
+        return split(value, sep, maxsplit)
+
     @unwrap_spec(maxsplit=int)
     def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
         res = []
         value = self._val(space)
         if space.is_none(w_sep):
-            res = rsplit(value, maxsplit=maxsplit)
+            res = self._rsplit(value, maxsplit=maxsplit)
             return self._newlist_unwrapped(space, res)
 
         by = self._op_val(space, w_sep)
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
-        res = rsplit(value, by, maxsplit)
+        res = self._split(value, by, maxsplit)
 
         return self._newlist_unwrapped(space, res)
 
+    @staticmethod
+    def _rsplit(value, sep=None, maxsplit=-1):
+        return value.split(sep, maxsplit)
+
     @unwrap_spec(keepends=bool)
     def descr_splitlines(self, space, keepends=False):
         value = self._val(space)
@@ -757,20 +767,21 @@
     def descr_zfill(self, space, width):
         selfval = self._val(space)
         if len(selfval) == 0:
-            return self._new(self._multi_chr(self._chr('0')) * width)
+            return self._new(self._multi_chr('0') * width)
         num_zeros = width - len(selfval)
         if num_zeros <= 0:
             # cannot return self, in case it is a subclass of str
             return self._new(selfval)
 
         builder = self._builder(width)
-        if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'):
+        if len(selfval) > 0 and (ORD(selfval, 0) == ord('+') or
+                                 ORD(selfval, 0) == ord('-')):
             # copy sign to first position
             builder.append(selfval[0])
             start = 1
         else:
             start = 0
-        builder.append_multiple_char(self._chr('0'), num_zeros)
+        builder.append_multiple_char('0', num_zeros)
         builder.append_slice(selfval, start, len(selfval))
         return self._new(builder.build())
 
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -1,5 +1,6 @@
 import py
 import sys
+from pypy.interpreter.utf8 import Utf8Str
 
 
 class TestUnicodeObject:
@@ -22,12 +23,12 @@
         assert len(warnings) == 2
 
     def test_listview_unicode(self):
-        w_str = self.space.wrap(u'abcd')
+        w_str = self.space.wrap(Utf8Str.from_unicode(u'abcd'))
         assert self.space.listview_unicode(w_str) == list(u"abcd")
 
     def test_new_shortcut(self):
         space = self.space
-        w_uni = self.space.wrap(u'abcd')
+        w_uni = self.space.wrap(Utf8Str.from_unicode(u'abcd'))
         w_new = space.call_method(
                 space.w_unicode, "__new__", space.w_unicode, w_uni)
         assert w_new is w_uni
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1,13 +1,14 @@
 """The builtin unicode implementation"""
 
 from rpython.rlib.objectmodel import (
-    compute_hash, compute_unique_id, import_from_mixin)
+    compute_hash, compute_unique_id, import_from_mixin, specialize)
 from rpython.rlib.buffer import StringBuffer
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import (
+    StringBuilder, replace, startswith, endswith)
 
 from pypy.interpreter import unicodehelper
 from pypy.interpreter.baseobjspace import W_Root
-from pypy.interpreter.utf8 import Utf8Str, utf8chr, utf8ord
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
 from pypy.interpreter.utf8_codecs import (
     make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
     unicode_encode_ascii, unicode_encode_utf_8)
@@ -67,11 +68,12 @@
         return self._value
 
     def readbuf_w(self, space):
-        from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
-        builder = StringBuilder(len(self._value) * UNICODE_SIZE)
-        for unich in self._value:
-            pack_unichar(unich, builder)
-        return StringBuffer(builder.build())
+        return StringBuffer(self._value.bytes)
+        #from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
+        #builder = StringBuilder(len(self._value) * UNICODE_SIZE)
+        #for unich in self._value:
+        #    pack_unichar(unich, builder)
+        #return StringBuffer(builder.build())
 
     def writebuf_w(self, space):
         raise OperationError(space.w_TypeError, space.wrap(
@@ -87,7 +89,7 @@
             raise oefmt(space.w_TypeError,
                          "ord() expected a character, but string of length %d "
                          "found", len(self._value))
-        return space.wrap(utf8ord(self))
+        return space.wrap(utf8ord(self._value))
 
     def _new(self, value):
         return W_UnicodeObject(value)
@@ -120,9 +122,18 @@
 
     def _chr(self, char):
         assert len(char) == 1
-        return unicode(char)[0]
+        assert ord(char) < 127
+        return Utf8Str(char, True)
 
-    _builder = UnicodeBuilder
+    @specialize.argtype(1)
+    def _multi_chr(self, c):
+        if isinstance(c, str):
+            assert ord(c) < 127
+            return Utf8Str(c, True)
+        else:
+            return c
+
+    _builder = Utf8Builder
 
     def _isupper(self, ch):
         return unicodedb.isupper(utf8ord(ch))
@@ -158,13 +169,13 @@
         return unicodedb.islinebreak(utf8ord(ch))
 
     def _upper(self, ch):
-        return unichr(unicodedb.toupper(utf8ord(ch)))
+        return utf8chr(unicodedb.toupper(utf8ord(ch)))
 
     def _lower(self, ch):
-        return unichr(unicodedb.tolower(utf8ord(ch)))
+        return utf8chr(unicodedb.tolower(utf8ord(ch)))
 
     def _title(self, ch):
-        return unichr(unicodedb.totitle(utf8ord(ch)))
+        return utf8chr(unicodedb.totitle(utf8ord(ch)))
 
     def _newlist_unwrapped(self, space, lst):
         return space.newlist_unicode(lst)
@@ -302,6 +313,35 @@
     def descr_mod(self, space, w_values):
         return mod_format(space, self, w_values, do_unicode=True)
 
+    @unwrap_spec(count=int)
+    def descr_replace(self, space, w_old, w_new, count=-1):
+        input = self._val(space)
+
+        sub = self._op_val(space, w_old)
+        by = self._op_val(space, w_new)
+        try:
+            res = replace(input.bytes, sub.bytes, by.bytes, count)
+        except OverflowError:
+            raise oefmt(space.w_OverflowError, "replace string is too long")
+
+        return self._new(Utf8Str(res))
+
+    def _startswith(self, space, value, w_prefix, start, end):
+        return startswith(value.bytes, self._op_val(space, w_prefix).bytes,
+                          start, end)
+
+    def _endswith(self, space, value, w_prefix, start, end):
+        return endswith(value.bytes, self._op_val(space, w_prefix).bytes,
+                        start, end)
+
+    @staticmethod
+    def _split(value, sep=None, maxsplit=-1):
+        return value.split(sep, maxsplit)
+
+    @staticmethod
+    def _rsplit(value, sep=None, maxsplit=-1):
+        return value.split(sep, maxsplit)
+
     def descr_translate(self, space, w_table):
         selfvalue = self._value
         w_sys = space.getbuiltinmodule('sys')
@@ -313,7 +353,7 @@
                 w_newval = space.getitem(w_table, space.wrap(utf8ord(unichar)))
             except OperationError as e:
                 if e.match(space, space.w_LookupError):
-                    result.append(unichar)
+                    result.append(unichar.bytes)
                 else:
                     raise
             else:
@@ -325,14 +365,14 @@
                         raise oefmt(space.w_TypeError,
                                     "character mapping must be in range(%s)",
                                     hex(maxunicode + 1))
-                    result.append(unichr(newval))
+                    result.append(utf8chr(newval).bytes)
                 elif space.isinstance_w(w_newval, space.w_unicode):
-                    result.append(space.unicode_w(w_newval))
+                    result.append(space.unicode_w(w_newval).bytes)
                 else:
                     raise oefmt(space.w_TypeError,
                                 "character mapping must return integer, None "
                                 "or unicode")
-        return W_UnicodeObject(u''.join(result))
+        return W_UnicodeObject(Utf8Str(''.join(result)))
 
     def descr_encode(self, space, w_encoding=None, w_errors=None):
         encoding, errors = _get_encoding_and_errors(space, w_encoding,
@@ -1090,7 +1130,7 @@
     digits = ['0', '1', '2', '3', '4',
               '5', '6', '7', '8', '9']
     for i in xrange(len(unistr)):
-        uchr = ord(unistr[i])
+        uchr = utf8ord(unistr, i)
         if unicodedb.isspace(uchr):
             result[i] = ' '
             continue


More information about the pypy-commit mailing list