[pypy-commit] pypy utf8-unicode2: Fix cpyext

Thu Jul 10 17:23:26 CEST 2014

Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72414:e6b1c681e8ec
Date: 2014-07-09 03:30 -0500
http://bitbucket.org/pypy/pypy/changeset/e6b1c681e8ec/

Log:	Fix cpyext

diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -4,6 +4,7 @@
 import sys
 from pypy.interpreter.utf8 import (
     Utf8Str, Utf8Builder, utf8chr, utf8ord)
+from rpython.rtyper.lltypesystem import rffi
 
 def build_utf8str():
     builder = Utf8Builder()
@@ -193,3 +194,15 @@
     assert s.rsplit(maxsplit=2) == u.rsplit(None, 2)
     assert s.rsplit(' ', 2) == u.rsplit(' ', 2)
     assert s.rsplit('\n') == [s]
+
+def test_copy_to_wcharp():
+    s = build_utf8str()
+    if sys.maxunicode < 0x10000:
+        # The last character requires a surrogate pair on narrow builds and
+        # so won't be converted correctly by rffi.wcharp2unicode
+        s = s[:-1]
+
+    wcharp = s.copy_to_wcharp()
+    u = rffi.wcharp2unicode(wcharp)
+    rffi.free_wcharp(wcharp)
+    assert s == u
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -3,6 +3,7 @@
 from rpython.rlib.runicode import utf8_code_length
 from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
 from rpython.rlib.rarithmetic import r_uint
+from rpython.rtyper.lltypesystem import rffi
 
 def utf8chr(value):
     # Like unichr, but returns a Utf8Str object
@@ -73,6 +74,8 @@
         self._len = length
 
     def index_of_char(self, char):
+        if char >= len(self):
+            return len(self.bytes)
         byte = 0
         pos = 0
         while pos < char:
@@ -412,6 +415,14 @@
             byte_pos -= 1
         return byte_pos
 
+    def copy_to_wcharp(self):
+        # XXX Temporary solution. This won't work correctly on systems
+        #     where sizeof(wchar_t) == 2. Also, it copies twice.
+        from pypy.interpreter.utf8_codecs import unicode_encode_unicode_internal
+        from rpython.rlib.runicode import MAXUNICODE
+        bytes = unicode_encode_unicode_internal(self, len(self), 'strict')
+        return rffi.cast(rffi.CWCHARP, rffi.str2charp(bytes))
+
 
 
 class Utf8Builder(object):
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -1538,8 +1538,8 @@
         if rs is not None:
             # py3k only
             errorhandler('strict', 'decimal', msg, s, collstart, collend)
-        for char in ru:
-            ch = ord(char)
+        for i in range(len(ru)):
+            ch = ORD(ru, i)
             if unicodedb.isspace(ch):
                 result.append(' ')
                 continue
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -229,7 +229,7 @@
         builder = UnicodeBuilder()
         pos = start
         while pos < end:
-            code = ord(obj[pos])
+            code = utf8ord(obj, pos)
             if (MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and
                        pos + 1 < end and 0xDC00 <= ord(obj[pos+1]) <= 0xDFFF):
                 code = (code & 0x03FF) << 10
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -188,7 +188,7 @@
         
         w_u = api.PyUnicode_DecodeUTF8(u, 2, None)
         assert space.type(w_u) is space.w_unicode
-        assert space.unwrap(w_u) == 'sp'
+        assert space.unwrap(w_u) == u'sp'
         rffi.free_charp(u)
 
     def test_encode_utf8(self, space, api):
@@ -296,7 +296,7 @@
         w_u = space.wrap(u'a')
         assert api.PyUnicode_FromObject(w_u) is w_u
         assert space.unwrap(
-            api.PyUnicode_FromObject(space.wrap('test'))) == 'test'
+            api.PyUnicode_FromObject(space.wrap('test'))) == u'test'
 
     def test_decode(self, space, api):
         b_text = rffi.str2charp('caf\x82xx')
@@ -306,7 +306,7 @@
 
         w_text = api.PyUnicode_FromEncodedObject(space.wrap("test"), b_encoding, None)
         assert space.isinstance_w(w_text, space.w_unicode)
-        assert space.unwrap(w_text) == "test"
+        assert space.unwrap(w_text) == u"test"
 
         assert api.PyUnicode_FromEncodedObject(space.wrap(u"test"), b_encoding, None) is None
         assert api.PyErr_Occurred() is space.w_TypeError
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,4 +1,5 @@
 from pypy.interpreter.error import OperationError
+from pypy.interpreter import utf8_codecs
 from rpython.rtyper.lltypesystem import rffi, lltype
 from pypy.module.unicodedata import unicodedb
 from pypy.module.cpyext.api import (
@@ -208,7 +209,7 @@
         # Copy unicode buffer
         w_unicode = from_ref(space, ref)
         u = space.unicode_w(w_unicode)
-        ref_unicode.c_buffer = rffi.unicode2wcharp(u)
+        ref_unicode.c_buffer = u.copy_to_wcharp()
     return ref_unicode.c_buffer
 
 @cpython_api([PyObject], rffi.CWCHARP)
@@ -552,7 +553,7 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_16_helper(
+    result, length, byteorder = utf8_codecs.str_decode_utf_16_helper(
         string, size, errors,
         True, # final ? false for multiple passes?
         None, # errorhandler
@@ -608,7 +609,7 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_32_helper(
+    result, length, byteorder = utf8_codecs.str_decode_utf_32_helper(
         string, size, errors,
         True, # final ? false for multiple passes?
         None, # errorhandler
@@ -640,7 +641,7 @@
     else:
         errors = None
     state = space.fromcache(CodecState)
-    result = runicode.unicode_encode_decimal(u, length, errors,
+    result = utf8_codecs.unicode_encode_decimal(u, length, errors,
                                              state.encode_error_handler)
     i = len(result)
     output[i] = '\0'
@@ -691,10 +692,12 @@
     suffix match), 0 otherwise. Return -1 if an error occurred."""
     str = space.unicode_w(w_str)
     substr = space.unicode_w(w_substr)
+    start = str.index_of_char(start)
+    end = str.index_of_char(end)
     if rffi.cast(lltype.Signed, direction) <= 0:
-        return rstring.startswith(str, substr, start, end)
+        return rstring.startswith(str.bytes, substr.bytes, start, end)
     else:
-        return rstring.endswith(str, substr, start, end)
+        return rstring.endswith(str.bytes, substr.bytes, start, end)
 
 @cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1)
 def PyUnicode_Count(space, w_str, w_substr, start, end):