[pypy-commit] pypy utf8-unicode2: Fix cpyext
waedt
noreply at buildbot.pypy.org
Thu Jul 10 17:23:26 CEST 2014
Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72414:e6b1c681e8ec
Date: 2014-07-09 03:30 -0500
http://bitbucket.org/pypy/pypy/changeset/e6b1c681e8ec/
Log: Fix cpyext
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -4,6 +4,7 @@
import sys
from pypy.interpreter.utf8 import (
Utf8Str, Utf8Builder, utf8chr, utf8ord)
+from rpython.rtyper.lltypesystem import rffi
def build_utf8str():
builder = Utf8Builder()
@@ -193,3 +194,15 @@
assert s.rsplit(maxsplit=2) == u.rsplit(None, 2)
assert s.rsplit(' ', 2) == u.rsplit(' ', 2)
assert s.rsplit('\n') == [s]
+
+def test_copy_to_wcharp():
+ s = build_utf8str()
+ if sys.maxunicode < 0x10000:
+ # The last character requires a surrogate pair on narrow builds and
+ # so won't be converted correctly by rffi.wcharp2unicode
+ s = s[:-1]
+
+ wcharp = s.copy_to_wcharp()
+ u = rffi.wcharp2unicode(wcharp)
+ rffi.free_wcharp(wcharp)
+ assert s == u
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -3,6 +3,7 @@
from rpython.rlib.runicode import utf8_code_length
from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
from rpython.rlib.rarithmetic import r_uint
+from rpython.rtyper.lltypesystem import rffi
def utf8chr(value):
# Like unichr, but returns a Utf8Str object
@@ -73,6 +74,8 @@
self._len = length
def index_of_char(self, char):
+ if char >= len(self):
+ return len(self.bytes)
byte = 0
pos = 0
while pos < char:
@@ -412,6 +415,14 @@
byte_pos -= 1
return byte_pos
+ def copy_to_wcharp(self):
+ # XXX Temporary solution. This won't work correctly on systems
+ # where sizeof(wchar_t) == 2. Also, it copies twice.
+ from pypy.interpreter.utf8_codecs import unicode_encode_unicode_internal
+ from rpython.rlib.runicode import MAXUNICODE
+ bytes = unicode_encode_unicode_internal(self, len(self), 'strict')
+ return rffi.cast(rffi.CWCHARP, rffi.str2charp(bytes))
+
class Utf8Builder(object):
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -1538,8 +1538,8 @@
if rs is not None:
# py3k only
errorhandler('strict', 'decimal', msg, s, collstart, collend)
- for char in ru:
- ch = ord(char)
+ for i in range(len(ru)):
+ ch = ORD(ru, i)
if unicodedb.isspace(ch):
result.append(' ')
continue
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -229,7 +229,7 @@
builder = UnicodeBuilder()
pos = start
while pos < end:
- code = ord(obj[pos])
+ code = utf8ord(obj, pos)
if (MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and
pos + 1 < end and 0xDC00 <= ord(obj[pos+1]) <= 0xDFFF):
code = (code & 0x03FF) << 10
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -188,7 +188,7 @@
w_u = api.PyUnicode_DecodeUTF8(u, 2, None)
assert space.type(w_u) is space.w_unicode
- assert space.unwrap(w_u) == 'sp'
+ assert space.unwrap(w_u) == u'sp'
rffi.free_charp(u)
def test_encode_utf8(self, space, api):
@@ -296,7 +296,7 @@
w_u = space.wrap(u'a')
assert api.PyUnicode_FromObject(w_u) is w_u
assert space.unwrap(
- api.PyUnicode_FromObject(space.wrap('test'))) == 'test'
+ api.PyUnicode_FromObject(space.wrap('test'))) == u'test'
def test_decode(self, space, api):
b_text = rffi.str2charp('caf\x82xx')
@@ -306,7 +306,7 @@
w_text = api.PyUnicode_FromEncodedObject(space.wrap("test"), b_encoding, None)
assert space.isinstance_w(w_text, space.w_unicode)
- assert space.unwrap(w_text) == "test"
+ assert space.unwrap(w_text) == u"test"
assert api.PyUnicode_FromEncodedObject(space.wrap(u"test"), b_encoding, None) is None
assert api.PyErr_Occurred() is space.w_TypeError
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,4 +1,5 @@
from pypy.interpreter.error import OperationError
+from pypy.interpreter import utf8_codecs
from rpython.rtyper.lltypesystem import rffi, lltype
from pypy.module.unicodedata import unicodedb
from pypy.module.cpyext.api import (
@@ -208,7 +209,7 @@
# Copy unicode buffer
w_unicode = from_ref(space, ref)
u = space.unicode_w(w_unicode)
- ref_unicode.c_buffer = rffi.unicode2wcharp(u)
+ ref_unicode.c_buffer = u.copy_to_wcharp()
return ref_unicode.c_buffer
@cpython_api([PyObject], rffi.CWCHARP)
@@ -552,7 +553,7 @@
else:
errors = None
- result, length, byteorder = runicode.str_decode_utf_16_helper(
+ result, length, byteorder = utf8_codecs.str_decode_utf_16_helper(
string, size, errors,
True, # final ? false for multiple passes?
None, # errorhandler
@@ -608,7 +609,7 @@
else:
errors = None
- result, length, byteorder = runicode.str_decode_utf_32_helper(
+ result, length, byteorder = utf8_codecs.str_decode_utf_32_helper(
string, size, errors,
True, # final ? false for multiple passes?
None, # errorhandler
@@ -640,7 +641,7 @@
else:
errors = None
state = space.fromcache(CodecState)
- result = runicode.unicode_encode_decimal(u, length, errors,
+ result = utf8_codecs.unicode_encode_decimal(u, length, errors,
state.encode_error_handler)
i = len(result)
output[i] = '\0'
@@ -691,10 +692,12 @@
suffix match), 0 otherwise. Return -1 if an error occurred."""
str = space.unicode_w(w_str)
substr = space.unicode_w(w_substr)
+ start = str.index_of_char(start)
+ end = str.index_of_char(end)
if rffi.cast(lltype.Signed, direction) <= 0:
- return rstring.startswith(str, substr, start, end)
+ return rstring.startswith(str.bytes, substr.bytes, start, end)
else:
- return rstring.endswith(str, substr, start, end)
+ return rstring.endswith(str.bytes, substr.bytes, start, end)
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1)
def PyUnicode_Count(space, w_str, w_substr, start, end):
More information about the pypy-commit
mailing list