[pypy-commit] pypy py3.6: PyUnicode_New()

Wed Oct 30 05:33:23 EDT 2019

Author: Armin Rigo <arigo at tunes.org>
Branch: py3.6
Changeset: r97891:699e6250c3cc
Date: 2019-10-30 10:32 +0100
http://bitbucket.org/pypy/pypy/changeset/699e6250c3cc/

Log:	PyUnicode_New()

diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -391,6 +391,38 @@
         s = module.asutf32(u)
         assert s == u.encode('utf-32')
 
+    def test_UnicodeNew(self):  # PyUnicode_New() plus direct buffer writes
+        module = self.import_extension('unicodenew', [
+            ("make", "METH_VARARGS",  # called below as make(length, unichr)
+            """
+                long length = PyLong_AsLong(PyTuple_GetItem(args, 0));
+                long unichr = PyLong_AsLong(PyTuple_GetItem(args, 1));
+
+                PyObject *retval = PyUnicode_New(length, (Py_UCS4)unichr);
+                if (unichr <= 255) {
+                    Py_UCS1 *retbuf = PyUnicode_1BYTE_DATA(retval);
+                    for (long i = 0; i < length; i++)
+                        retbuf[i] = unichr;
+                }
+                else if (unichr <= 65535) {
+                    Py_UCS2 *retbuf = PyUnicode_2BYTE_DATA(retval);
+                    for (long i = 0; i < length; i++)
+                        retbuf[i] = unichr;
+                }
+                else {
+                    Py_UCS4 *retbuf = PyUnicode_4BYTE_DATA(retval);
+                    for (long i = 0; i < length; i++)
+                        retbuf[i] = unichr;
+                }
+                return retval;
+            """),
+            ])
+        assert module.make(0, 32) == u''  # length 0: the fill loop never runs
+        assert module.make(1, 32) == u' '  # unichr <= 255: 1-byte branch
+        assert module.make(5, 255) == u'\xff' * 5  # upper edge of 1-byte branch
+        assert module.make(3, 0x1234) == u'\u1234' * 3  # 2-byte branch
+        assert module.make(7, 0x12345) == u'\U00012345' * 7  # 4-byte branch
+
 
 class TestUnicode(BaseApiTest):
     def test_unicodeobject(self, space):
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -8,12 +8,13 @@
 from pypy.interpreter.unicodehelper import (
     wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
     unicode_encode_decimal, utf8_encode_utf_16_helper, BYTEORDER,
-    utf8_encode_utf_32_helper)
+    utf8_encode_utf_32_helper, str_decode_latin_1)
 from pypy.objspace.std.unicodeobject import unicodedb
 from pypy.module.cpyext.api import (
     CANNOT_FAIL, Py_ssize_t, build_type_checkers, cpython_api,
     bootstrap_function, CONST_STRING, INTP_real,
-    CONST_WSTRING, Py_CLEANUP_SUPPORTED, slot_function, cts, parse_dir)
+    CONST_WSTRING, Py_CLEANUP_SUPPORTED, slot_function, cts, parse_dir,
+    PyTypeObjectPtr)
 from pypy.module.cpyext.pyerrors import PyErr_BadArgument
 from pypy.module.cpyext.pyobject import (
     PyObject, PyObjectP, decref, make_ref, from_ref, track_reference,
@@ -22,6 +23,7 @@
 from pypy.module._codecs.interp_codecs import (
     CodecState, latin_1_decode, utf_16_decode, utf_32_decode)
 from pypy.objspace.std import unicodeobject
+from rpython.rlib.debug import fatalerror
 import sys
 
 ## See comment in bytesobject.py.
@@ -91,8 +93,31 @@
     Creates the unicode in the interpreter. The PyUnicodeObject buffer must not
     be modified after this call. Can raise in wcharpsize2utf8
     """
-    lgt = get_wsize(py_obj)
-    s_utf8 = wcharpsize2utf8(space, get_wbuffer(py_obj), lgt)
+    if not get_wbuffer(py_obj):
+        if not get_compact(py_obj):
+            fatalerror(
+                "internal cpyext error: realizing a non-compact unicode "
+                "object with wbuffer == null")
+        data = get_data(py_obj)
+        size = get_len(py_obj)
+        kind = get_kind(py_obj)
+        value = rffi.charpsize2str(data, size * kind)
+        if kind == _1BYTE_KIND:
+            s_utf8, lgt, _ = str_decode_latin_1(value, 'strict', True, None)
+        elif kind == _2BYTE_KIND:
+            decoded = str_decode_utf_16_helper(value, 'strict', True, None,
+                                               byteorder=BYTEORDER)
+            s_utf8, lgt = decoded[:2]
+        elif kind == _4BYTE_KIND:
+            decoded = str_decode_utf_32_helper(value, 'strict', True, None,
+                                               byteorder=BYTEORDER)
+            s_utf8, lgt = decoded[:2]
+        else:
+            assert False
+    else:
+        lgt = get_wsize(py_obj)
+        s_utf8 = wcharpsize2utf8(space, get_wbuffer(py_obj), lgt)
+
     w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type))
     w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type)
     w_obj.__init__(s_utf8, lgt)
@@ -191,6 +216,15 @@
     py_obj.c_wstr_length = value
 
 def get_data(py_obj):
+    if get_compact(py_obj):  # compact: pointer is computed, c_data not read
+        if get_ascii(py_obj):  # ascii flag selects which header struct to skip
+            PyASCIIObject = cts.gettype('PyASCIIObject')
+            struct_size = rffi.sizeof(PyASCIIObject)
+        else:
+            PyCompactUnicodeObject = cts.gettype('PyCompactUnicodeObject')
+            struct_size = rffi.sizeof(PyCompactUnicodeObject)
+        data = rffi.ptradd(rffi.cast(rffi.CCHARP, py_obj), struct_size)  # object address + header size
+        return cts.cast('void *', data)
     py_obj = cts.cast('PyUnicodeObject*', py_obj)
     return py_obj.c_data
 
@@ -1138,3 +1172,64 @@
 def PyUnicode_AsUCS4Copy(space, ref):
     return PyUnicode_AsUCS4(space, ref, cts.cast('Py_UCS4*', 0), 0,
                             rffi.cast(rffi.INT_real, 1))
+
+@cts.decl("PyObject* PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)",
+          result_is_ll=True)
+def PyUnicode_New(space, size, maxchar):  # returns a raw compact unicode PyObject
+    PyASCIIObject = cts.gettype('PyASCIIObject')
+    PyCompactUnicodeObject = cts.gettype('PyCompactUnicodeObject')
+    # Derive the kind, per-character width and header size from maxchar.
+    is_ascii = False
+    is_sharing = False
+    struct_size = rffi.sizeof(PyCompactUnicodeObject)
+    if maxchar < 128:
+        kind = _1BYTE_KIND
+        char_size = 1
+        is_ascii = True
+        struct_size = rffi.sizeof(PyASCIIObject)  # ascii uses the PyASCIIObject header
+    elif maxchar < 256:
+        kind = _1BYTE_KIND
+        char_size = 1
+    elif maxchar < 65536:
+        kind = _2BYTE_KIND
+        char_size = 2
+        if rffi.sizeof(lltype.UniChar) == 2:  # UniChar is as wide as one character
+            is_sharing = True
+    else:
+        if maxchar > MAX_UNICODE:
+            raise oefmt(space.w_SystemError,
+                        "invalid maximum character passed to PyUnicode_New")
+        kind = _4BYTE_KIND
+        char_size = 4
+        if rffi.sizeof(lltype.UniChar) == 4:  # UniChar is as wide as one character
+            is_sharing = True
+    # Reject negative sizes (SystemError) and sizes whose total byte count
+    # would not fit below sys.maxint (MemoryError).
+    if size < 0:
+        raise oefmt(space.w_SystemError,
+                    "Negative size passed to PyUnicode_New")
+    if size > ((sys.maxint - struct_size) / char_size - 1):
+        raise oefmt(space.w_MemoryError, "PyUnicode_New: size too big")
+    # One raw, zero-filled block holds the header plus size + 1 characters.
+    # Duplicated allocation code from _PyObject_New() instead of a call to
+    # PyObject_New() so we are able to allocate space for the object and
+    # its data buffer.
+    pytype = as_pyobj(space, space.w_unicode)
+    pytype = rffi.cast(PyTypeObjectPtr, pytype)
+    buf = lltype.malloc(rffi.VOIDP.TO, struct_size + (size + 1) * char_size,
+                        flavor='raw', zero=True,
+                        add_memory_pressure=True)
+    pyobj = rffi.cast(PyObject, buf)
+    pyobj.c_ob_refcnt = 1
+    # pyobj.c_ob_pypy_link remains null for now (the block was zero-filled)
+    pyobj.c_ob_type = pytype
+    # Record the length and the kind/compact/ready/ascii state.
+    set_len(pyobj, size)
+    set_kind(pyobj, kind)
+    set_compact(pyobj, True)
+    set_ready(pyobj, True)
+    set_ascii(pyobj, is_ascii)
+    if is_sharing:  # the inline data buffer doubles as the wchar buffer
+        set_wsize(pyobj, size)
+        set_wbuffer(pyobj, rffi.cast(rffi.CWCHARP, get_data(pyobj)))
+    return pyobj