[pypy-commit] pypy utf8-unicode2: WIP. Most codec and unicodeobject tests pass now
waedt
noreply at buildbot.pypy.org
Mon Jun 30 17:32:09 CEST 2014
Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72291:927fb84a5116
Date: 2014-06-30 10:14 -0500
http://bitbucket.org/pypy/pypy/changeset/927fb84a5116/
Log: WIP. Most codec and unicodeobject tests pass now
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -13,6 +13,7 @@
from pypy.interpreter.astcompiler.consts import (
CO_OPTIMIZED, CO_NEWLOCALS, CO_VARARGS, CO_VARKEYWORDS, CO_NESTED,
CO_GENERATOR, CO_KILL_DOCSTRING, CO_YIELD_INSIDE_TRY)
+from pypy.interpreter.utf8 import Utf8Str
from pypy.tool.stdlib_opcode import opcodedesc, HAVE_ARGUMENT
from rpython.rlib.rarithmetic import intmask
from rpython.rlib.objectmodel import compute_hash
@@ -150,6 +151,8 @@
for const in code.co_consts:
if isinstance(const, types.CodeType): # from stable compiler
const = code_hook(space, const, hidden_applevel, code_hook)
+ if isinstance(const, unicode):
+ const = Utf8Str.from_unicode(const)
newconsts_w[num] = space.wrap(const)
num += 1
# stick the underlying CPython magic value, if the code object
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -1,3 +1,7 @@
+# -*- coding: utf-8 -*-
+
+import py
+import sys
from pypy.interpreter.utf8 import (
Utf8Str, Utf8Builder, utf8chr, utf8ord)
@@ -20,6 +24,39 @@
0xF0, 0x9F, 0x98, 0xBD,
]]
+def test_iterator():
+ s = build_utf8str()
+ iter = s.codepoint_iter()
+ assert iter.peek_next() == 0x41
+ assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D]
+
+ for i in range(1, 5):
+ iter = s.codepoint_iter()
+ iter.move(i)
+ if i != 4:
+ assert iter.peek_next() == [0x41, 0x10F, 0x20AC, 0x1F63D][i]
+ assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][i:]
+
+ for i in range(1, 5):
+ iter = s.codepoint_iter()
+ list(iter) # move the iterator to the end
+ iter.move(-i)
+ assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][4-i:]
+
+ iter = s.char_iter()
+ l = [s.bytes.decode('utf8') for s in list(iter)]
+ if sys.maxunicode < 65536:
+ assert l[:3] == [u'A', u'\u010F', u'\u20AC']
+ else:
+ assert l == [u'A', u'\u010F', u'\u20AC', u'\U00001F63D']
+
+def test_builder_append_slice():
+ builder = Utf8Builder()
+ builder.append_slice(Utf8Str.from_unicode(u"0ê0"), 1, 2)
+ builder.append_slice("Test", 1, 3)
+
+ assert builder.build() == u"êes"
+
def test_unicode_literal_comparison():
builder = Utf8Builder()
builder.append(0x10F)
@@ -55,9 +92,65 @@
assert s[-1] == utf8chr(0x1F63D)
assert s[-2] == utf8chr(0x20AC)
+ with py.test.raises(IndexError):
+ c = s[4]
+
def test_getslice():
s = build_utf8str()
assert s[0:1] == u'A'
assert s[0:2] == u'A\u010F'
assert s[1:2] == u'\u010F'
+
+def test_convert_indices():
+ s = build_utf8str()
+
+ assert s.index_of_char(0) == 0
+ assert s.index_of_char(1) == 1
+ assert s.index_of_char(2) == 3
+ assert s.index_of_char(3) == 6
+
+ for i in range(len(s)):
+ assert s.char_index_of_byte(s.index_of_char(i)) == i
+
+def test_join():
+ s = Utf8Str(' ')
+ assert s.join([]) == u''
+
+
+ assert s.join([Utf8Str('one')]) == u'one'
+ assert s.join([Utf8Str('one'), Utf8Str('two')]) == u'one two'
+
+def test_find():
+ u = u"äëïöü"
+ s = Utf8Str.from_unicode(u)
+
+ for c in u:
+ assert s.find(Utf8Str.from_unicode(u)) == u.find(u)
+ assert s.rfind(Utf8Str.from_unicode(u)) == u.rfind(u)
+
+ assert s.find('') == u.find('')
+ assert s.rfind('') == u.rfind('')
+
+ assert s.find('1') == u.find('1')
+ assert s.rfind('1') == u.rfind('1')
+
+ assert Utf8Str.from_unicode(u'abcdefghiabc').rfind(u'') == 12
+
+def test_count():
+ u = u"12äëïöü223"
+ s = Utf8Str.from_unicode(u)
+
+ assert s.count("1") == u.count("1")
+ assert s.count("2") == u.count("2")
+ assert s.count(Utf8Str.from_unicode(u"ä")) == u.count(u"ä")
+
+def test_split():
+ # U+00A0 is a non-breaking space
+ u = u"one two three\xA0four"
+ s = Utf8Str.from_unicode(u)
+
+ assert s.split() == u.split()
+ assert s.split(' ') == u.split(' ')
+ assert s.split(maxsplit=1) == u.split(None, 1)
+ assert s.split('\n') == [s]
diff --git a/pypy/interpreter/test/test_utf8_codecs.py b/pypy/interpreter/test/test_utf8_codecs.py
--- a/pypy/interpreter/test/test_utf8_codecs.py
+++ b/pypy/interpreter/test/test_utf8_codecs.py
@@ -6,19 +6,6 @@
from pypy.interpreter.utf8 import Utf8Str
from pypy.interpreter import utf8_codecs
-'''
-try:
- import signal
-except ImportError:
- pass
-else:
- class MyKeyboardInterrupt(BaseException):
- pass
- def _interrupt(*args):
- __tracebackhide__ = True
- raise MyKeyboardInterrupt
- signal.signal(signal.SIGINT, _interrupt)
-'''
class UnicodeTests(object):
def typeequals(self, x, y):
@@ -697,19 +684,13 @@
for s in ["\xd7\x90", "\xd6\x96", "\xeb\x96\x95", "\xf0\x90\x91\x93"]:
self.checkencode(s, "utf-8")
- # TODO: Is this test useful?
def test_utf8_surrogates(self):
# make sure that the string itself is not marshalled
u = u"\ud800"
for i in range(4):
u += u"\udc00"
- if utf8_codecs.MAXUNICODE < 65536:
- # Check replacing of two surrogates by single char while encoding
- self.checkencode(u, "utf-8")
- else:
- # This is not done in wide unicode builds
- py.test.raises(UnicodeEncodeError, self.checkencode, u, "utf-8")
+ py.test.raises(UnicodeEncodeError, self.checkencode, u, "utf-8")
def test_ascii_error(self):
self.checkencodeerror(
@@ -780,13 +761,13 @@
u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
if runicode.MAXUNICODE < 65536:
# Narrow unicode build, consider utf16 surrogate pairs
- assert runicode.unicode_encode_unicode_escape(
+ assert utf8_codecs.unicode_encode_unicode_escape(
u, len(u), True) == r'\U00010000'
- assert runicode.unicode_encode_raw_unicode_escape(
+ assert utf8_codecs.unicode_encode_raw_unicode_escape(
u, len(u), True) == r'\U00010000'
else:
# Wide unicode build, don't merge utf16 surrogate pairs
- assert runicode.unicode_encode_unicode_escape(
+ assert utf8_codecs.unicode_encode_unicode_escape(
u, len(u), True) == r'\ud800\udc00'
- assert runicode.unicode_encode_raw_unicode_escape(
+ assert utf8_codecs.unicode_encode_raw_unicode_escape(
u, len(u), True) == r'\ud800\udc00'
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -1,6 +1,7 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.objectmodel import specialize
from rpython.rlib.runicode import utf8_code_length
+from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
from rpython.rlib.rarithmetic import r_uint
def utf8chr(value):
@@ -9,9 +10,7 @@
b.append(value)
return b.build()
-def utf8ord(ustr, start=0):
- bytes = ustr.bytes
- start = ustr.index_of_char(start)
+def utf8ord_bytes(bytes, start):
codepoint_length = utf8_code_length[ord(bytes[start])]
if codepoint_length == 1:
@@ -31,6 +30,16 @@
(ord(bytes[start + 2]) & 0x3F) << 6 |
(ord(bytes[start + 3]) & 0x3F))
+def utf8ord(ustr, start=0):
+ start = ustr.index_of_char(start)
+ return utf8ord_bytes(ustr.bytes, start)
+
+@specialize.argtype(0)
+def ORD(s, pos):
+ if isinstance(s, Utf8Str):
+ return utf8ord(s, pos)
+ else:
+ return ord(s[pos])
class Utf8Str(object):
_immutable_fields_ = ['bytes', '_is_ascii', '_len']
@@ -72,14 +81,27 @@
return byte
+ def char_index_of_byte(self, byte_):
+ byte = 0
+ pos = 0
+ while byte < byte_:
+ pos += 1
+ byte += utf8_code_length[ord(self.bytes[byte])]
+
+ return pos
+
def __getitem__(self, char_pos):
# This if statement is needed for [-1:0] to slice correctly
+ if char_pos >= self._len:
+ raise IndexError()
if char_pos < 0:
char_pos += self._len
return self[char_pos:char_pos+1]
def __getslice__(self, start, stop):
- assert start < stop
+ assert start <= stop
+ if start == stop:
+ return Utf8Str('')
# TODO: If start > _len or stop >= _len, then raise exception
if self._is_ascii:
@@ -102,6 +124,13 @@
return Utf8Str(self.bytes[start_byte:stop_byte], is_ascii,
stop - start)
+ def __add__(self, other):
+ return Utf8Str(self.bytes + other.bytes,
+ self._is_ascii and other._is_ascii)
+
+ def __mul__(self, count):
+ return Utf8Str(self.bytes * count, self._is_ascii)
+
def __len__(self):
return self._len
@@ -127,32 +156,162 @@
raise TypeError()
def __iter__(self):
- byte_pos = 0
- while byte_pos < len(self.bytes):
- cplen = utf8_code_length[ord(self.bytes[byte_pos])]
- yield Utf8Str(self.bytes[byte_pos:byte_pos+cplen])
- byte_pos += cplen
+ return self.char_iter()
- @specialize.argtype(1)
- def find(self, other):
+ def char_iter(self):
+ return Utf8StrCharIterator(self)
+
+ def codepoint_iter(self):
+ return Utf8StrCodePointIterator(self)
+
+ @specialize.argtype(1, 2)
+ def _bound_check(self, start, end):
+ if start is None:
+ start = 0
+ elif start < 0:
+ start += len(self)
+ if start < 0:
+ start = 0
+ else:
+ start = self.index_of_char(start)
+ elif start > len(self):
+ start = -1
+ else:
+ start = self.index_of_char(start)
+
+ if end is None or end >= len(self):
+ end = len(self.bytes)
+ elif end < 0:
+ end += len(self)
+ if end < 0:
+ end = 0
+ else:
+ end = self.index_of_char(end)
+ elif end > len(self):
+ end = len(self.bytes)
+ else:
+ end = self.index_of_char(end)
+
+ return start, end
+
+ @specialize.argtype(2, 3)
+ def find(self, other, start=None, end=None):
+ start, end = self._bound_check(start, end)
+ if start == -1:
+ return -1
+
if isinstance(other, Utf8Str):
- return self.bytes.find(other.bytes)
- if isinstance(other, unicode):
- return unicode(self.bytes, 'utf8').find(other)
- if isinstance(other, str):
- return self.bytes.find(other)
+ pos = self.bytes.find(other.bytes, start, end)
+ elif isinstance(other, unicode):
+ pos = unicode(self.bytes, 'utf8').find(other, start, end)
+ elif isinstance(other, str):
+ pos = self.bytes.find(other, start, end)
- def rfind(self, other):
+ if pos == -1:
+ return -1
+
+ return self.char_index_of_byte(pos)
+
+ @specialize.argtype(2, 3)
+ def rfind(self, other, start=None, end=None):
+ start, end = self._bound_check(start, end)
+ if start == -1:
+ return -1
+
if isinstance(other, Utf8Str):
- return self.bytes.rfind(other.bytes)
- if isinstance(other, unicode):
- return unicode(self.bytes, 'utf8').rfind(other)
- if isinstance(other, str):
- return self.bytes.rfind(other)
+ pos = self.bytes.rfind(other.bytes, start, end)
+ elif isinstance(other, unicode):
+ return unicode(self.bytes, 'utf8').rfind(other, start, end)
+ elif isinstance(other, str):
+ pos = self.bytes.rfind(other, start, end)
+
+ if pos == -1:
+ return -1
+
+ return self.char_index_of_byte(pos)
+
+ @specialize.argtype(2, 3)
+ def count(self, other, start=None, end=None):
+ start, end = self._bound_check(start, end)
+ if start == -1:
+ return 0
+
+ if isinstance(other, Utf8Str):
+ count = self.bytes.count(other.bytes, start, end)
+ elif isinstance(other, unicode):
+ return unicode(self.bytes, 'utf8').count(other, start, end)
+ elif isinstance(other, str):
+ count = self.bytes.count(other, start, end)
+
+ if count == -1:
+ return -1
+
+ return count
def endswith(self, other):
return self.rfind(other) == len(self) - len(other)
+ @specialize.argtype(1)
+ def split(self, other=None, maxsplit=-1):
+ if other is not None:
+ if isinstance(other, str):
+ other_bytes = other
+ if isinstance(other, Utf8Str):
+ other_bytes = other.bytes
+ return [Utf8Str(s) for s in self.bytes.split(other_bytes, maxsplit)]
+
+ res = []
+ iter = self.codepoint_iter()
+ while True:
+ # the start of the first word
+ for cd in iter:
+ if not unicodedb.isspace(cd):
+ break
+ else:
+ break
+
+ iter.prev_count(1)
+ start_byte = iter.byte_pos
+ iter.next_count(1)
+
+ if maxsplit == 0:
+ res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+ break
+
+ for cd in iter:
+ if unicodedb.isspace(cd):
+ break
+ else:
+ # Hit the end of the string
+ res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+ break
+
+ iter.prev_count(1)
+ res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos]))
+ iter.next_count(1)
+ maxsplit -= 1
+
+ return res
+
+ @specialize.argtype(1)
+ def rsplit(self, other=None, maxsplit=-1):
+ if other is not None:
+ if isinstance(other, str):
+ other_bytes = other
+ if isinstance(other, Utf8Str):
+ other_bytes = other.bytes
+ return [Utf8Str(s) for s in self.bytes.rsplit(other_bytes, maxsplit)]
+
+ # TODO: I need to make a reverse_codepoint_iter first
+
+ def join(self, other):
+ if len(other) == 0:
+ return Utf8Str('')
+
+ assert isinstance(other[0], Utf8Str)
+ return Utf8Str(self.bytes.join([s.bytes for s in other]),
+ self._is_ascii and all(s._is_ascii for s in other))
+
def as_unicode(self):
"""NOT_RPYTHON"""
return self.bytes.decode('utf-8')
@@ -162,6 +321,84 @@
"""NOT_RPYTHON"""
return Utf8Str(u.encode('utf-8'))
+class Utf8StrCodePointIterator(object):
+ def __init__(self, ustr):
+ self.ustr = ustr
+ self.pos = 0
+ self.byte_pos = 0
+
+ if len(ustr) != 0:
+ self.current = utf8ord_bytes(ustr.bytes, 0)
+ else:
+ self.current = -1
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ if self.pos == len(self.ustr):
+ raise StopIteration()
+ self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+
+ self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+ self.pos += 1
+
+ return self.current
+
+ def next_count(self, count=1):
+ self.pos += count
+ while count > 1:
+ self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+ count -= 1
+ self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+ self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+
+ def prev_count(self, count=1):
+ self.pos -= count
+ while count > 0:
+ self.byte_pos -= 1
+ while utf8_code_length[ord(self.ustr.bytes[self.byte_pos])] == 0:
+ self.byte_pos -= 1
+ count -= 1
+
+ self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+
+ def move(self, count):
+ if count > 0:
+ self.next_count(count)
+ elif count < 0:
+ self.prev_count(-count)
+
+ def peek_next(self):
+ return utf8ord_bytes(self.ustr.bytes, self.byte_pos)
+
+class Utf8StrCharIterator(object):
+ def __init__(self, ustr):
+ self.ustr = ustr
+ self.byte_pos = 0
+ self.current = self._get_current()
+
+ def __iter__(self):
+ return self
+
+ def _get_current(self):
+ if self.byte_pos == len(self.ustr.bytes):
+ return None
+ length = utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+ return Utf8Str(''.join([self.ustr.bytes[i]
+ for i in range(self.byte_pos, self.byte_pos + length)]),
+ length == 1)
+
+ def next(self):
+ #import pdb; pdb.set_trace()
+ ret = self.current
+ if ret is None:
+ raise StopIteration()
+
+ self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+ self.current = self._get_current()
+ return ret
+
class Utf8Builder(object):
@specialize.argtype(1)
def __init__(self, init_size=None):
@@ -204,10 +441,19 @@
assert ord(c) < 128
self._builder.append(c)
- def append_slice(self, s, start, end, is_ascii=False):
- self._builder.append_slice(s, start, end)
- if not is_ascii:
- self._is_ascii = False
+ @specialize.argtype(1)
+ def append_slice(self, s, start, end):
+ if isinstance(s, str):
+ self._builder.append_slice(s, start, end)
+ elif isinstance(s, Utf8Str):
+ self._builder.append_slice(s.bytes, s.index_of_char(start),
+ s.index_of_char(end))
+ else:
+ raise TypeError("Invalid type '%s' for Utf8Str.append_slice" %
+ type(s))
+
+ def append_multiple_char(self, c, count):
+ self._builder.append_multiple_char(c, count)
def build(self):
return Utf8Str(self._builder.build(), self._is_ascii)
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -6,7 +6,7 @@
from rpython.rlib.unicodedata import unicodedb
from rpython.rlib.runicode import utf8_code_length
-from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord, ORD
BYTEORDER = sys.byteorder
@@ -33,7 +33,7 @@
# Non-escape characters are interpreted as Unicode ordinals
if ch != '\\':
- builder.append(ch)
+ builder.append(ord(ch))
pos += 1
continue
@@ -383,6 +383,8 @@
@specialize.arg_or_var(3)
def unicode_encode_ucs1_helper(p, size, errors,
errorhandler=None, limit=256):
+ if len(p) == 0:
+ return ''
if errorhandler is None:
errorhandler = default_unicode_error_encode
if limit == 256:
@@ -415,8 +417,9 @@
result.append(rs)
continue
for ch in ru:
- if ord(ch) < limit:
- result.append(chr(ord(ch)))
+ cd = ORD(ch, 0)
+ if cd < limit:
+ result.append(chr(cd))
else:
errorhandler("strict", encoding, reason, p,
collstart, collend)
@@ -436,15 +439,60 @@
# ____________________________________________________________
# utf-8 {{{
-# Converting bytes (utf8) to unicode?
-# I guess we just make sure we're looking at valid utf-8 and then make the
-# object?
def unicode_encode_utf_8(s, size, errors, errorhandler=None,
allow_surrogates=False):
- if size < len(s):
- return s.bytes[0:s.index_of_char(size)]
- return s.bytes
+ if len(s) == 0:
+ return ''
+ if errorhandler is None:
+ errorhandler = default_unicode_error_encode
+
+ return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
+ allow_surrogates)
+
+def unicode_encode_utf_8_impl(s, size, errors, errorhandler, allow_surrogates):
+ iter = s.codepoint_iter()
+ for oc in iter:
+ if oc >= 0xD800 and oc <= 0xDFFF:
+ break
+ if iter.pos == size:
+ return s.bytes
+ else:
+ return s.bytes
+
+ iter.move(-1)
+ result = Utf8Builder(len(s.bytes))
+ result.append_slice(s.bytes, 0, iter.byte_pos)
+
+ for oc in iter:
+ if oc >= 0xD800 and oc <= 0xDFFF:
+ # Check the next character to see if this is a surrogate pair
+ if (iter.pos != len(s) and oc <= 0xDBFF and
+ 0xDC00 <= iter.peek_next() <= 0xDFFF):
+ oc2 = iter.next()
+ result.append(((oc - 0xD800) << 10 | (oc2 - 0xDC00)) + 0x10000)
+ elif allow_surrogates:
+ result.append(oc)
+ else:
+ ru, rs, pos = errorhandler(errors, 'utf8',
+ 'surrogates not allowed', s,
+ iter.pos-1, iter.pos)
+ iter.move(pos - iter.pos)
+ if rs is not None:
+ # py3k only
+ result.append(rs)
+ continue
+ for ch in ru:
+ if ord(ch) < 0x80:
+ result.append(ch)
+ else:
+ errorhandler('strict', 'utf8',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ else:
+ result.append(oc)
+
+ return result.build().bytes
def str_decode_utf_8(s, size, errors, final=False,
errorhandler=None, allow_surrogates=False):
@@ -1219,7 +1267,7 @@
# ____________________________________________________________
# Charmap {{{
-ERROR_CHAR = u'\ufffe'
+ERROR_CHAR = Utf8Str.from_unicode(u'\ufffe')
@specialize.argtype(5)
def str_decode_charmap(s, size, errors, final=False,
@@ -1296,84 +1344,16 @@
def str_decode_unicode_internal(s, size, errors, final=False,
errorhandler=None):
- if errorhandler is None:
- errorhandler = default_unicode_error_decode
- if size == 0:
- return u'', 0
-
- if MAXUNICODE < 65536:
- unicode_bytes = 2
+ if BYTEORDER == 'little':
+ return str_decode_utf_32_le(s, size, errors, errorhandler)
else:
- unicode_bytes = 4
- if BYTEORDER == "little":
- start = 0
- stop = unicode_bytes
- step = 1
- else:
- start = unicode_bytes - 1
- stop = -1
- step = -1
-
- result = UnicodeBuilder(size // unicode_bytes)
- pos = 0
- while pos < size:
- if pos > size - unicode_bytes:
- res, pos = errorhandler(errors, "unicode_internal",
- "truncated input",
- s, pos, size)
- result.append(res)
- if pos > size - unicode_bytes:
- break
- continue
- t = r_uint(0)
- h = 0
- for j in range(start, stop, step):
- t += r_uint(ord(s[pos + j])) << (h*8)
- h += 1
- if t > MAXUNICODE:
- res, pos = errorhandler(errors, "unicode_internal",
- "unichr(%d) not in range" % (t,),
- s, pos, pos + unicode_bytes)
- result.append(res)
- continue
- result.append(UNICHR(t))
- pos += unicode_bytes
- return result.build(), pos
+ return str_decode_utf_32_be(s, size, errors, errorhandler)
def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
- if size == 0:
- return ''
-
- if MAXUNICODE < 65536:
- unicode_bytes = 2
+ if BYTEORDER == 'little':
+ return unicode_encode_utf_32_le(s, size, errors, errorhandler)
else:
- unicode_bytes = 4
-
- result = StringBuilder(size * unicode_bytes)
- pos = 0
- while pos < size:
- oc = utf8ord(s, pos)
- if MAXUNICODE < 65536:
- if BYTEORDER == "little":
- result.append(chr(oc & 0xFF))
- result.append(chr(oc >> 8 & 0xFF))
- else:
- result.append(chr(oc >> 8 & 0xFF))
- result.append(chr(oc & 0xFF))
- else:
- if BYTEORDER == "little":
- result.append(chr(oc & 0xFF))
- result.append(chr(oc >> 8 & 0xFF))
- result.append(chr(oc >> 16 & 0xFF))
- result.append(chr(oc >> 24 & 0xFF))
- else:
- result.append(chr(oc >> 24 & 0xFF))
- result.append(chr(oc >> 16 & 0xFF))
- result.append(chr(oc >> 8 & 0xFF))
- result.append(chr(oc & 0xFF))
- pos += 1
-
- return result.build()
+ return unicode_encode_utf_32_be(s, size, errors, errorhandler)
# }}}
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -3,6 +3,7 @@
from rpython.rlib.rstring import UnicodeBuilder
from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
+from pypy.interpreter.utf8 import Utf8Builder, Utf8Str, utf8chr, utf8ord
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -206,13 +207,13 @@
w_end = space.getattr(w_exc, space.wrap('end'))
size = space.int_w(w_end) - space.int_w(w_start)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
- text = u'?' * size
+ text = Utf8Str('?' * size, True)
return space.newtuple([space.wrap(text), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
- text = u'\ufffd'
+ text = utf8chr(0xfffd)
return space.newtuple([space.wrap(text), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
- text = u'\ufffd' * size
+ text = utf8chr(0xfffd) * size
return space.newtuple([space.wrap(text), w_end])
else:
raise oefmt(space.w_TypeError,
@@ -251,25 +252,26 @@
start = space.int_w(space.getattr(w_exc, space.wrap('start')))
w_end = space.getattr(w_exc, space.wrap('end'))
end = space.int_w(w_end)
- builder = UnicodeBuilder()
+
+ builder = Utf8Builder()
pos = start
while pos < end:
- oc = ord(obj[pos])
+ oc = utf8ord(obj, pos)
num = hex(oc)
if (oc >= 0x10000):
- builder.append(u"\\U")
+ builder.append("\\U")
zeros = 8
elif (oc >= 0x100):
- builder.append(u"\\u")
+ builder.append("\\u")
zeros = 4
else:
- builder.append(u"\\x")
+ builder.append("\\x")
zeros = 2
lnum = len(num)
nb = zeros + 2 - lnum # num starts with '0x'
if nb > 0:
builder.append_multiple_char(u'0', nb)
- builder.append_slice(unicode(num), 2, lnum)
+ builder.append_slice(num, 2, lnum)
pos += 1
return space.newtuple([space.wrap(builder.build()), w_end])
else:
@@ -378,7 +380,6 @@
# ____________________________________________________________
# delegation to runicode
-#from rpython.rlib import runicode
from pypy.interpreter import utf8_codecs
def make_encoder_wrapper(name):
@@ -548,7 +549,7 @@
if not 0 <= x <= 0x10FFFF:
raise oefmt(space.w_TypeError,
"character mapping must be in range(0x110000)")
- return code_to_unichr(x)
+ return utf8chr(x)
elif space.is_w(w_ch, space.w_None):
# Charmap may return None
return errorchar
@@ -566,7 +567,7 @@
# get the character from the mapping
try:
- w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+ w_ch = space.getitem(self.w_mapping, space.newint(utf8ord(ch)))
except OperationError, e:
if not e.match(space, space.w_LookupError):
raise
@@ -595,7 +596,7 @@
if errors is None:
errors = 'strict'
if len(string) == 0:
- return space.newtuple([space.wrap(u''), space.wrap(0)])
+ return space.newtuple([space.wrap(Utf8Str('')), space.wrap(0)])
if space.is_none(w_mapping):
mapping = None
@@ -631,7 +632,7 @@
w_charmap = space.newdict()
for num in range(len(chars)):
elem = chars[num]
- space.setitem(w_charmap, space.newint(ord(elem)), space.newint(num))
+ space.setitem(w_charmap, space.newint(utf8ord(elem)), space.newint(num))
return w_charmap
# ____________________________________________________________
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -10,6 +10,7 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import (
WrappedDefault, interp2app, interpindirect2app, unwrap_spec)
+from pypy.interpreter.utf8 import Utf8Str
from pypy.objspace.std import newformat
from pypy.objspace.std.basestringtype import basestring_typedef
from pypy.objspace.std.formatting import mod_format
@@ -715,11 +716,11 @@
sub = self_as_uni._op_val(space, w_old)
by = self_as_uni._op_val(space, w_new)
try:
- res = replace(input, sub, by, count)
+ res = replace(input.bytes, sub.bytes, by.bytes, count)
except OverflowError:
raise oefmt(space.w_OverflowError,
"replace string is too long")
- return self_as_uni._new(res)
+ return self_as_uni._new(Utf8Str(res))
return self._StringMethods_descr_replace(space, w_old, w_new, count)
_StringMethods_descr_join = descr_join
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -2,13 +2,14 @@
String formatting routines.
"""
import sys
-from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib import jit
from rpython.rlib.rfloat import formatd, DTSF_ALT, isnan, isinf
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
from rpython.rlib.unroll import unrolling_iterable
from rpython.rlib.rarithmetic import INT_MAX
from rpython.tool.sourcetools import func_with_new_name
+from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import Utf8Builder, ORD
class BaseStringFormatter(object):
@@ -168,7 +169,7 @@
def peekchr(self):
# return the 'current' character
try:
- return self.fmt[self.fmtpos]
+ return ORD(self.fmt, self.fmtpos)
except IndexError:
space = self.space
raise OperationError(space.w_ValueError,
@@ -185,16 +186,16 @@
pcount = 1
while 1:
try:
- c = fmt[i]
+ c = ORD(fmt, i)
except IndexError:
space = self.space
raise OperationError(space.w_ValueError,
space.wrap("incomplete format key"))
- if c == ')':
+ if c == ord(')'):
pcount -= 1
if pcount == 0:
break
- elif c == '(':
+ elif c == ord('('):
pcount += 1
i += 1
self.fmtpos = i + 1 # first character after ')'
@@ -210,7 +211,7 @@
return space.getitem(self.w_valuedict, w_key)
def parse_fmt(self):
- if self.peekchr() == '(':
+ if self.peekchr() == ord('('):
w_value = self.getmappingvalue(self.getmappingkey())
else:
w_value = None
@@ -223,7 +224,7 @@
self.f_ljust = True
self.width = -self.width
- if self.peekchr() == '.':
+ if self.peekchr() == ord('.'):
self.forward()
self.prec = self.peel_num('prec', INT_MAX)
if self.prec < 0:
@@ -232,7 +233,7 @@
self.prec = -1
c = self.peekchr()
- if c == 'h' or c == 'l' or c == 'L':
+ if c == ord('h') or c == ord('l') or c == ord('L'):
self.forward()
return w_value
@@ -247,15 +248,15 @@
self.f_zero = False
while True:
c = self.peekchr()
- if c == '-':
+ if c == ord('-'):
self.f_ljust = True
- elif c == '+':
+ elif c == ord('+'):
self.f_sign = True
- elif c == ' ':
+ elif c == ord(' '):
self.f_blank = True
- elif c == '#':
+ elif c == ord('#'):
self.f_alt = True
- elif c == '0':
+ elif c == ord('0'):
self.f_zero = True
else:
break
@@ -266,7 +267,7 @@
def peel_num(self, name, maxval):
space = self.space
c = self.peekchr()
- if c == '*':
+ if c == ord('*'):
self.forward()
w_value = self.nextinputvalue()
if name == 'width':
@@ -277,7 +278,7 @@
assert False
result = 0
while True:
- digit = ord(c) - ord('0')
+ digit = c - ord('0')
if not (0 <= digit <= 9):
break
if result > (maxval - digit) / 10:
@@ -291,16 +292,17 @@
def format(self):
lgt = len(self.fmt) + 4 * len(self.values_w) + 10
if do_unicode:
- result = UnicodeBuilder(lgt)
+ result = Utf8Builder(lgt)
else:
result = StringBuilder(lgt)
self.result = result
+
while True:
# fast path: consume as many characters as possible
fmt = self.fmt
i = i0 = self.fmtpos
while i < len(fmt):
- if fmt[i] == '%':
+ if ORD(fmt, i) == ord('%'):
break
i += 1
else:
@@ -313,8 +315,8 @@
w_value = self.parse_fmt()
c = self.peekchr()
self.forward()
- if c == '%':
- self.std_wp(const('%'))
+ if c == ord('%'):
+ self.std_wp('%')
continue
if w_value is None:
w_value = self.nextinputvalue()
@@ -325,7 +327,7 @@
if c == c1:
# 'c1' is an annotation constant here,
# so this getattr() is ok
- do_fmt = getattr(self, 'fmt_' + c1)
+ do_fmt = getattr(self, 'fmt_' + chr(c1))
do_fmt(w_value)
break
else:
@@ -348,7 +350,7 @@
else:
s = c
msg = "unsupported format character '%s' (0x%x) at index %d" % (
- s, ord(c), self.fmtpos - 1)
+ s, ORD(c, 0), self.fmtpos - 1)
raise OperationError(space.w_ValueError, space.wrap(msg))
def std_wp(self, r):
@@ -359,7 +361,7 @@
prec = self.prec
if prec == -1 and self.width == 0:
# fast path
- self.result.append(const(r))
+ self.result.append(r)
return
if prec >= 0 and prec < length:
length = prec # ignore the end of the string if too long
@@ -369,12 +371,12 @@
padding = 0
assert padding >= 0
if not self.f_ljust and padding > 0:
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
# add any padding at the left of 'r'
padding = 0
result.append_slice(r, 0, length) # add 'r' itself
if padding > 0:
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
# add any remaining padding at the right
std_wp._annspecialcase_ = 'specialize:argtype(1)'
@@ -405,18 +407,19 @@
assert padding >= 0
if padnumber == '>':
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
# pad with spaces on the left
if sign:
- result.append(const(r[0])) # the sign
- result.append(const(prefix)) # the prefix
+ # TODO: Why r[0]?
+ result.append(r[0]) # the sign
+ result.append(prefix) # the prefix
if padnumber == '0':
- result.append_multiple_char(const('0'), padding)
+ result.append_multiple_char('0', padding)
# pad with zeroes
- result.append_slice(const(r), int(sign), len(r))
+ result.append_slice(r, int(sign), len(r))
# the rest of the number
if padnumber == '<': # spaces on the right
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
def string_formatting(self, w_value):
space = self.space
@@ -499,7 +502,7 @@
# an "unrolling" list of all the known format characters,
# collected from which fmt_X() functions are defined in the class
FORMATTER_CHARS = unrolling_iterable(
- [_name[-1] for _name in dir(StringFormatter)
+ [ord(_name[-1]) for _name in dir(StringFormatter)
if len(_name) == 5 and _name.startswith('fmt_')])
def format(space, w_fmt, values_w, w_valuedict, do_unicode):
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -4,6 +4,7 @@
import string
from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD
from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rfloat import copysign, formatd
@@ -47,7 +48,7 @@
def __init__(self, space, is_unicode, template):
self.space = space
self.is_unicode = is_unicode
- self.empty = u"" if is_unicode else ""
+ self.empty = Utf8Str("") if is_unicode else ""
self.template = template
def build(self, args):
@@ -59,7 +60,7 @@
def _build_string(self, start, end, level):
space = self.space
if self.is_unicode:
- out = rstring.UnicodeBuilder()
+ out = Utf8Builder()
else:
out = rstring.StringBuilder()
if not level:
@@ -74,23 +75,23 @@
space = self.space
last_literal = i = start
while i < end:
- c = s[i]
+ c = ORD(s, i)
i += 1
- if c == "{" or c == "}":
+ if c == ord("{") or c == ord("}"):
at_end = i == end
# Find escaped "{" and "}"
markup_follows = True
- if c == "}":
- if at_end or s[i] != "}":
+ if c == ord("}"):
+ if at_end or ORD(s, i) != ord("}"):
raise OperationError(space.w_ValueError,
space.wrap("Single '}'"))
i += 1
markup_follows = False
- if c == "{":
+ if c == ord("{"):
if at_end:
raise OperationError(space.w_ValueError,
space.wrap("Single '{'"))
- if s[i] == "{":
+ if ORD(s, i) == ord("{"):
i += 1
markup_follows = False
# Attach literal data, ending with { or }
@@ -111,11 +112,11 @@
field_start = i
recursive = False
while i < end:
- c = s[i]
- if c == "{":
+ c = ORD(s, i)
+ if c == ord("{"):
recursive = True
nested += 1
- elif c == "}":
+ elif c == ord("}"):
nested -= 1
if not nested:
break
@@ -139,9 +140,9 @@
i = start
while i < end:
c = s[i]
- if c == ":" or c == "!":
+ if c == ord(":") or c == ord("!"):
end_name = i
- if c == "!":
+ if c == ord("!"):
i += 1
if i == end:
w_msg = self.space.wrap("expected conversion")
@@ -170,7 +171,7 @@
end = len(name)
while i < end:
c = name[i]
- if c == "[" or c == ".":
+ if c == ord("[") or c == ord("."):
break
i += 1
empty = not i
@@ -228,12 +229,12 @@
i = start
while i < end:
c = name[i]
- if c == ".":
+ if c == ord("."):
i += 1
start = i
while i < end:
c = name[i]
- if c == "[" or c == ".":
+ if c == ord("[") or c == ord("."):
break
i += 1
if start == i:
@@ -245,13 +246,13 @@
else:
self.parser_list_w.append(space.newtuple([
space.w_True, w_attr]))
- elif c == "[":
+ elif c == ord("["):
got_bracket = False
i += 1
start = i
while i < end:
c = name[i]
- if c == "]":
+ if c == ord("]"):
got_bracket = True
break
i += 1
@@ -280,8 +281,8 @@
i = 0
end = len(name)
while i < end:
- c = name[i]
- if c == "[" or c == ".":
+ c = ORD(name, i)
+ if c == ord("[") or c == ord("."):
break
i += 1
if i == 0:
@@ -303,10 +304,10 @@
def _convert(self, w_obj, conversion):
space = self.space
- conv = conversion[0]
- if conv == "r":
+ conv = ORD(conversion, 0)
+ if conv == ord("r"):
return space.repr(w_obj)
- elif conv == "s":
+ elif conv == ord("s"):
if self.is_unicode:
return space.call_function(space.w_unicode, w_obj)
return space.str(w_obj)
@@ -416,15 +417,15 @@
self.spec = spec
def _is_alignment(self, c):
- return (c == "<" or
- c == ">" or
- c == "=" or
- c == "^")
+ return (c == ord("<") or
+ c == ord(">") or
+ c == ord("=") or
+ c == ord("^"))
def _is_sign(self, c):
- return (c == " " or
- c == "+" or
- c == "-")
+ return (c == ord(" ") or
+ c == ord("+") or
+ c == ord("-"))
def _parse_spec(self, default_type, default_align):
space = self.space
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -9,6 +9,7 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import WrappedDefault, unwrap_spec
+from pypy.interpreter.utf8 import ORD
from pypy.objspace.std import slicetype
from pypy.objspace.std.sliceobject import W_SliceObject, normalize_simple_slice
@@ -141,7 +142,7 @@
if d > 0:
offset = d//2 + (d & width & 1)
fillchar = self._multi_chr(fillchar[0])
- centered = offset * fillchar + value + (d - offset) * fillchar
+ centered = fillchar * offset + value + fillchar * (d - offset)
else:
centered = value
@@ -192,9 +193,9 @@
return self._empty()
if self._use_rstr_ops(space, self):
- splitted = value.split(self._chr('\t'))
+ splitted = value.split('\t')
else:
- splitted = split(value, self._chr('\t'))
+ splitted = split(value, '\t')
try:
ovfcheck(len(splitted) * tabsize)
@@ -203,7 +204,7 @@
expanded = oldtoken = splitted.pop(0)
for token in splitted:
- expanded += self._multi_chr(self._chr(' ')) * self._tabindent(oldtoken,
+ expanded += self._multi_chr(' ') * self._tabindent(oldtoken,
tabsize) + token
oldtoken = token
@@ -218,7 +219,8 @@
offset = len(token)
while 1:
- if token[offset-1] == "\n" or token[offset-1] == "\r":
+ if (ORD(token, offset-1) == ord("\n") or
+ ORD(token, offset-1) == ord("\r")):
break
distance += 1
offset -= 1
@@ -455,7 +457,7 @@
d = width - len(value)
if d > 0:
fillchar = self._multi_chr(fillchar[0])
- value += d * fillchar
+ value += fillchar * d
return self._new(value)
@@ -469,7 +471,7 @@
d = width - len(value)
if d > 0:
fillchar = self._multi_chr(fillchar[0])
- value = d * fillchar + value
+ value = fillchar * d + value
return self._new(value)
@@ -558,31 +560,39 @@
res = []
value = self._val(space)
if space.is_none(w_sep):
- res = split(value, maxsplit=maxsplit)
+ res = self._split(value, None, maxsplit)
return self._newlist_unwrapped(space, res)
by = self._op_val(space, w_sep)
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
- res = split(value, by, maxsplit)
+ res = self._split(value, by, maxsplit)
return self._newlist_unwrapped(space, res)
+ @staticmethod
+ def _split(value, sep=None, maxsplit=-1):
+ return split(value, sep, maxsplit)
+
@unwrap_spec(maxsplit=int)
def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
res = []
value = self._val(space)
if space.is_none(w_sep):
- res = rsplit(value, maxsplit=maxsplit)
+ res = self._rsplit(value, maxsplit=maxsplit)
return self._newlist_unwrapped(space, res)
by = self._op_val(space, w_sep)
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
- res = rsplit(value, by, maxsplit)
+ res = self._split(value, by, maxsplit)
return self._newlist_unwrapped(space, res)
+ @staticmethod
+ def _rsplit(value, sep=None, maxsplit=-1):
+ return value.split(sep, maxsplit)
+
@unwrap_spec(keepends=bool)
def descr_splitlines(self, space, keepends=False):
value = self._val(space)
@@ -757,20 +767,21 @@
def descr_zfill(self, space, width):
selfval = self._val(space)
if len(selfval) == 0:
- return self._new(self._multi_chr(self._chr('0')) * width)
+ return self._new(self._multi_chr('0') * width)
num_zeros = width - len(selfval)
if num_zeros <= 0:
# cannot return self, in case it is a subclass of str
return self._new(selfval)
builder = self._builder(width)
- if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'):
+ if len(selfval) > 0 and (ORD(selfval, 0) == ord('+') or
+ ORD(selfval, 0) == ord('-')):
# copy sign to first position
builder.append(selfval[0])
start = 1
else:
start = 0
- builder.append_multiple_char(self._chr('0'), num_zeros)
+ builder.append_multiple_char('0', num_zeros)
builder.append_slice(selfval, start, len(selfval))
return self._new(builder.build())
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -1,5 +1,6 @@
import py
import sys
+from pypy.interpreter.utf8 import Utf8Str
class TestUnicodeObject:
@@ -22,12 +23,12 @@
assert len(warnings) == 2
def test_listview_unicode(self):
- w_str = self.space.wrap(u'abcd')
+ w_str = self.space.wrap(Utf8Str.from_unicode(u'abcd'))
assert self.space.listview_unicode(w_str) == list(u"abcd")
def test_new_shortcut(self):
space = self.space
- w_uni = self.space.wrap(u'abcd')
+ w_uni = self.space.wrap(Utf8Str.from_unicode(u'abcd'))
w_new = space.call_method(
space.w_unicode, "__new__", space.w_unicode, w_uni)
assert w_new is w_uni
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1,13 +1,14 @@
"""The builtin unicode implementation"""
from rpython.rlib.objectmodel import (
- compute_hash, compute_unique_id, import_from_mixin)
+ compute_hash, compute_unique_id, import_from_mixin, specialize)
from rpython.rlib.buffer import StringBuffer
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import (
+ StringBuilder, replace, startswith, endswith)
from pypy.interpreter import unicodehelper
from pypy.interpreter.baseobjspace import W_Root
-from pypy.interpreter.utf8 import Utf8Str, utf8chr, utf8ord
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
from pypy.interpreter.utf8_codecs import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
unicode_encode_ascii, unicode_encode_utf_8)
@@ -67,11 +68,12 @@
return self._value
def readbuf_w(self, space):
- from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
- builder = StringBuilder(len(self._value) * UNICODE_SIZE)
- for unich in self._value:
- pack_unichar(unich, builder)
- return StringBuffer(builder.build())
+ return StringBuffer(self._value.bytes)
+ #from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
+ #builder = StringBuilder(len(self._value) * UNICODE_SIZE)
+ #for unich in self._value:
+ # pack_unichar(unich, builder)
+ #return StringBuffer(builder.build())
def writebuf_w(self, space):
raise OperationError(space.w_TypeError, space.wrap(
@@ -87,7 +89,7 @@
raise oefmt(space.w_TypeError,
"ord() expected a character, but string of length %d "
"found", len(self._value))
- return space.wrap(utf8ord(self))
+ return space.wrap(utf8ord(self._value))
def _new(self, value):
return W_UnicodeObject(value)
@@ -120,9 +122,18 @@
def _chr(self, char):
assert len(char) == 1
- return unicode(char)[0]
+ assert ord(char) < 127
+ return Utf8Str(char, True)
- _builder = UnicodeBuilder
+ @specialize.argtype(1)
+ def _multi_chr(self, c):
+ if isinstance(c, str):
+ assert ord(c) < 127
+ return Utf8Str(c, True)
+ else:
+ return c
+
+ _builder = Utf8Builder
def _isupper(self, ch):
return unicodedb.isupper(utf8ord(ch))
@@ -158,13 +169,13 @@
return unicodedb.islinebreak(utf8ord(ch))
def _upper(self, ch):
- return unichr(unicodedb.toupper(utf8ord(ch)))
+ return utf8chr(unicodedb.toupper(utf8ord(ch)))
def _lower(self, ch):
- return unichr(unicodedb.tolower(utf8ord(ch)))
+ return utf8chr(unicodedb.tolower(utf8ord(ch)))
def _title(self, ch):
- return unichr(unicodedb.totitle(utf8ord(ch)))
+ return utf8chr(unicodedb.totitle(utf8ord(ch)))
def _newlist_unwrapped(self, space, lst):
return space.newlist_unicode(lst)
@@ -302,6 +313,35 @@
def descr_mod(self, space, w_values):
return mod_format(space, self, w_values, do_unicode=True)
+ @unwrap_spec(count=int)
+ def descr_replace(self, space, w_old, w_new, count=-1):
+ input = self._val(space)
+
+ sub = self._op_val(space, w_old)
+ by = self._op_val(space, w_new)
+ try:
+ res = replace(input.bytes, sub.bytes, by.bytes, count)
+ except OverflowError:
+ raise oefmt(space.w_OverflowError, "replace string is too long")
+
+ return self._new(Utf8Str(res))
+
+ def _startswith(self, space, value, w_prefix, start, end):
+ return startswith(value.bytes, self._op_val(space, w_prefix).bytes,
+ start, end)
+
+ def _endswith(self, space, value, w_prefix, start, end):
+ return endswith(value.bytes, self._op_val(space, w_prefix).bytes,
+ start, end)
+
+ @staticmethod
+ def _split(value, sep=None, maxsplit=-1):
+ return value.split(sep, maxsplit)
+
+ @staticmethod
+ def _rsplit(value, sep=None, maxsplit=-1):
+ return value.split(sep, maxsplit)
+
def descr_translate(self, space, w_table):
selfvalue = self._value
w_sys = space.getbuiltinmodule('sys')
@@ -313,7 +353,7 @@
w_newval = space.getitem(w_table, space.wrap(utf8ord(unichar)))
except OperationError as e:
if e.match(space, space.w_LookupError):
- result.append(unichar)
+ result.append(unichar.bytes)
else:
raise
else:
@@ -325,14 +365,14 @@
raise oefmt(space.w_TypeError,
"character mapping must be in range(%s)",
hex(maxunicode + 1))
- result.append(unichr(newval))
+ result.append(utf8chr(newval).bytes)
elif space.isinstance_w(w_newval, space.w_unicode):
- result.append(space.unicode_w(w_newval))
+ result.append(space.unicode_w(w_newval).bytes)
else:
raise oefmt(space.w_TypeError,
"character mapping must return integer, None "
"or unicode")
- return W_UnicodeObject(u''.join(result))
+ return W_UnicodeObject(Utf8Str(''.join(result)))
def descr_encode(self, space, w_encoding=None, w_errors=None):
encoding, errors = _get_encoding_and_errors(space, w_encoding,
@@ -1090,7 +1130,7 @@
digits = ['0', '1', '2', '3', '4',
'5', '6', '7', '8', '9']
for i in xrange(len(unistr)):
- uchr = ord(unistr[i])
+ uchr = utf8ord(unistr, i)
if unicodedb.isspace(uchr):
result[i] = ' '
continue
More information about the pypy-commit mailing list