[Python-checkins] Revert "bpo-39087: Add _PyUnicode_GetUTF8Buffer()" (GH-18985)

Sat Mar 14 02:59:32 EDT 2020

https://github.com/python/cpython/commit/3a8c56295d6272ad2177d2de8af4c3f824f3ef92
commit: 3a8c56295d6272ad2177d2de8af4c3f824f3ef92
branch: master
author: Inada Naoki <songofacandy at gmail.com>
committer: GitHub <noreply at github.com>
date: 2020-03-14T15:59:27+09:00
summary:

Revert "bpo-39087: Add _PyUnicode_GetUTF8Buffer()" (GH-18985)

* Revert "bpo-39087: Add _PyUnicode_GetUTF8Buffer() (GH-17659)"

This reverts commit c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b.

* Update unicodeobject.h

files:
D Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst
M Include/cpython/unicodeobject.h
M Lib/test/test_unicode.py
M Modules/_testcapimodule.c
M Objects/unicodeobject.c

diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index be91d2d9fc675..0df64790c1c22 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -734,19 +734,6 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
 
 /* --- Manage the default encoding ---------------------------------------- */
 
-/* Get a buffer to the UTF-8 encoding of the Unicode object unicode.
-   Returns -1 on error.
-
-   Successful calls must be paired to
-   calls to PyBuffer_Release.
-*/
-
-PyAPI_FUNC(int) _PyUnicode_GetUTF8Buffer(
-    PyObject *unicode,      /* Unicode object */
-    const char *errors,     /* error handling */
-    Py_buffer *view         /* (out) buffer to the UTF-8 encoding */
-    );
-
 /* Returns a pointer to the default encoding (UTF-8) of the
    Unicode object unicode and the size of the encoded representation
    in bytes stored in *size.
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 0522513777f60..2839889646789 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -2830,28 +2830,6 @@ def test_asucs4(self):
             self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
             self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
 
-    # Test _PyUnicode_GetUTF8Buffer()
-    @support.cpython_only
-    def test_getutf8buffer(self):
-        from _testcapi import unicode_getutf8buffer, unicode_test_getutf8buffer
-
-        # Run tests wrtten in C.  Raise an error when test failed.
-        unicode_test_getutf8buffer()
-
-        ascii_ = "foo"
-        bmp = '\u0100'
-        bmp2 = '\uffff'
-        nonbmp = chr(0x10ffff)
-        surrogates = 'a\ud800b\udfffc'
-
-        self.assertEqual(unicode_getutf8buffer(ascii_), b'foo')
-        self.assertEqual(unicode_getutf8buffer(bmp), b'\xc4\x80')
-        self.assertEqual(unicode_getutf8buffer(bmp2), b'\xef\xbf\xbf')
-        self.assertEqual(unicode_getutf8buffer(nonbmp), b'\xf4\x8f\xbf\xbf')
-        self.assertRaises(UnicodeEncodeError, unicode_getutf8buffer, surrogates)
-        self.assertEqual(unicode_getutf8buffer(surrogates, "surrogatepass"),
-                         b'a\xed\xa0\x80b\xed\xbf\xbfc')
-
     # Test PyUnicode_AsUTF8()
     @support.cpython_only
     def test_asutf8(self):
diff --git a/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst b/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst
deleted file mode 100644
index 2c2c85d93b211..0000000000000
--- a/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst	
+++ /dev/null
@@ -1,2 +0,0 @@
-Add new ``_PyUnicode_GetUTF8Buffer`` private API to get UTF-8 encode of the
-unicode object without cache or extra allocation.
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 09b77064de15e..3cc558689b6c1 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -1967,216 +1967,6 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
     return Py_BuildValue("(Nn)", result, utf8_len);
 }
 
-static PyObject *
-unicode_getutf8buffer(PyObject *self, PyObject *args)
-{
-    PyObject *unicode;
-    const char *errors = NULL;
-    if(!PyArg_ParseTuple(args, "O|s", &unicode, &errors)) {
-        return NULL;
-    }
-
-    Py_buffer buffer;
-    if (_PyUnicode_GetUTF8Buffer(unicode, errors, &buffer) < 0) {
-        return NULL;
-    }
-
-    assert(buffer.obj != NULL);
-    assert(buffer.obj == unicode || PyBytes_CheckExact(buffer.obj));
-
-    PyObject *result = PyBytes_FromStringAndSize(buffer.buf, buffer.len);
-    PyBuffer_Release(&buffer);
-    return result;
-}
-
-static PyObject *
-unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored))
-{
-    Py_buffer buf;
-
-    // Test 1: ASCII string
-    PyObject *str = PyUnicode_FromString("hello");
-    if (str == NULL) {
-        return NULL;
-    }
-    Py_ssize_t refcnt = Py_REFCNT(str);
-
-    // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
-    int ret = _PyUnicode_GetUTF8Buffer(str, NULL,  &buf);
-    assert(ret == 0);
-
-    if (buf.obj != str) {
-        PyErr_Format(TestError,
-                     "buf.obj must be equal to str. (%s:%d)",
-                     __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-
-    if (buf.len != PyUnicode_GET_LENGTH(str)) {
-        PyErr_Format(TestError,
-                     "buf.len must be equal to len(str). (%s:%d)",
-                     __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-    assert(((const char*)buf.buf)[5] == '\0');
-
-    if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) {
-        PyErr_Format(TestError,
-                     "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)",
-                     __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-
-    if (refcnt + 1 != Py_REFCNT(str)) {
-        PyErr_Format(TestError,
-                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
-                     refcnt + 1, Py_REFCNT(str),
-                     __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-
-    PyBuffer_Release(&buf);
-
-    if (refcnt != Py_REFCNT(str)) {
-        PyErr_Format(TestError,
-                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
-                     refcnt, Py_REFCNT(str),
-                     __FILE__, __LINE__);
-        Py_DECREF(str);
-        return NULL;
-    }
-
-    Py_DECREF(str);
-
-    // Test 2: non-ASCII string
-
-    // "hello" in Japanese.  len(str)==5, len(str.encode()) == 15.
-    str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf");
-    if (str == NULL) {
-        return NULL;
-    }
-    refcnt = Py_REFCNT(str);
-    assert(PyUnicode_GET_LENGTH(str) == 5);
-
-    if (_PyUnicode_GetUTF8Buffer(str, NULL,  &buf) < 0) {
-        Py_DECREF(str);
-        if (!PyErr_Occurred()) {
-            PyErr_Format(TestError,
-                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
-                         "without exception set. (%s:%d)",
-                         __FILE__, __LINE__);
-        }
-        return NULL;
-    }
-
-    if (!PyBytes_CheckExact(buf.obj)) {
-        PyErr_Format(TestError,
-                     "buf.obj must be a bytes object, got %R (%s:%d)",
-                     buf.obj, __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-
-    if (buf.len != 15) {
-        PyErr_Format(TestError,
-                     "Expected buf.len == 15, actual %zd (%s:%d)",
-                     buf.len, __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-    assert(((const char*)buf.buf)[15] == '\0');
-
-    if (refcnt != Py_REFCNT(str)) {
-        PyErr_Format(TestError,
-                     "Py_REFCNT(str) must not be changed. (%s:%d)",
-                     __FILE__, __LINE__);
-        // Do not DECREF here because refcnt is broken.
-        return NULL;
-    }
-
-    PyBuffer_Release(&buf);
-
-    // Test 3: There is a UTF-8 cache
-    // Reuse str of the previoss test.
-
-    const char *cache = PyUnicode_AsUTF8(str);
-    if (cache == NULL) {
-        return NULL;
-    }
-
-    if (_PyUnicode_GetUTF8Buffer(str, NULL,  &buf) < 0) {
-        Py_DECREF(str);
-        if (!PyErr_Occurred()) {
-            PyErr_Format(TestError,
-                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
-                         "without exception set. (%s:%d)",
-                         __FILE__, __LINE__);
-        }
-        return NULL;
-    }
-
-    if (buf.obj != str) {
-        PyErr_Format(TestError,
-                     "buf.obj must be equal to str. (%s:%d)",
-                     __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-
-    if (buf.buf != cache) {
-        PyErr_Format(TestError,
-                     "buf.buf must be equal to the UTF-8 cache (%s:%d)",
-                     __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-
-    if (buf.len != 15) {
-        PyErr_Format(TestError,
-                     "Expected buf.len == 15, actual %zd (%s:%d)",
-                     buf.len, __FILE__, __LINE__);
-        PyBuffer_Release(&buf);
-        Py_DECREF(str);
-        return NULL;
-    }
-    assert(((const char*)buf.buf)[15] == '\0');
-
-    if (refcnt + 1 != Py_REFCNT(str)) {
-        PyErr_Format(TestError,
-                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
-                     refcnt + 1, Py_REFCNT(str),
-                     __FILE__, __LINE__);
-        // Do not DECREF here because refcnt is broken.
-        return NULL;
-    }
-
-    PyBuffer_Release(&buf);
-
-    if (refcnt != Py_REFCNT(str)) {
-        PyErr_Format(TestError,
-                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
-                     refcnt, Py_REFCNT(str),
-                     __FILE__, __LINE__);
-        // Do not DECREF here because refcnt is broken.
-        return NULL;
-    }
-
-    Py_DECREF(str);
-    Py_RETURN_NONE;
-}
-
 static PyObject *
 unicode_findchar(PyObject *self, PyObject *args)
 {
@@ -5602,8 +5392,6 @@ static PyMethodDef TestMethods[] = {
     {"unicode_asucs4",          unicode_asucs4,                  METH_VARARGS},
     {"unicode_asutf8",          unicode_asutf8,                  METH_VARARGS},
     {"unicode_asutf8andsize",   unicode_asutf8andsize,           METH_VARARGS},
-    {"unicode_getutf8buffer",   unicode_getutf8buffer,           METH_VARARGS},
-    {"unicode_test_getutf8buffer", unicode_test_getutf8buffer,   METH_NOARGS},
     {"unicode_findchar",        unicode_findchar,                METH_VARARGS},
     {"unicode_copycharacters",  unicode_copycharacters,          METH_VARARGS},
     {"unicode_encodedecimal",   unicode_encodedecimal,           METH_VARARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0fea435599be8..3d99f11ecff6f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3991,41 +3991,6 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
 }
 
 
-int
-_PyUnicode_GetUTF8Buffer(PyObject *unicode, const char *errors,
-                         Py_buffer *view)
-{
-    if (!PyUnicode_Check(unicode)) {
-        PyErr_BadArgument();
-        return -1;
-    }
-    if (PyUnicode_READY(unicode) == -1) {
-        return -1;
-    }
-
-    if (PyUnicode_UTF8(unicode) != NULL
-            && Py_TYPE(unicode)->tp_as_buffer == NULL) {
-        return PyBuffer_FillInfo(view, unicode,
-                PyUnicode_UTF8(unicode),
-                PyUnicode_UTF8_LENGTH(unicode),
-                /* readonly */ 1, PyBUF_SIMPLE);
-    }
-
-    // Unlike PyUnicode_AsUTF8AndSize(), this function doesn't
-    // create a UTF-8 cache for speed and efficiency.
-    PyObject *bytes = _PyUnicode_AsUTF8String(unicode, errors);
-    if (bytes == NULL) {
-        return -1;
-    }
-    assert(PyBytes_CheckExact(bytes));
-    if (PyObject_GetBuffer(bytes, view, PyBUF_SIMPLE) < 0) {
-        Py_DECREF(bytes);
-        return -1;
-    }
-    return 0;
-}
-
-  
 static int unicode_fill_utf8(PyObject *unicode);
 
 const char *