[pypy-commit] cffi default: Write a copy of wchar_helper.h that uses CPython 3.3's new

Mon Jun 19 06:09:59 EDT 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r2985:5e554e5c0dfc
Date: 2017-06-19 12:06 +0200
http://bitbucket.org/cffi/cffi/changeset/5e554e5c0dfc/

Log:	Write a copy of wchar_helper.h that uses CPython 3.3's new unicode
	string API. It makes sense anyway for speed reasons, but it also
	avoids even more special cases for platforms with 16-bit wchar_t on
	CPython >= 3.3.

diff --git a/c/_cffi_backend.c b/c/_cffi_backend.c
--- a/c/_cffi_backend.c
+++ b/c/_cffi_backend.c
@@ -286,7 +286,11 @@
 # include "file_emulator.h"
 #endif
 
-#include "wchar_helper.h"
+#ifdef PyUnicode_KIND     /* Python >= 3.3 */
+# include "wchar_helper_3.h"
+#else
+# include "wchar_helper.h"
+#endif
 
 #include "../cffi/_cffi_errors.h"
 
diff --git a/c/wchar_helper_3.h b/c/wchar_helper_3.h
new file mode 100644
--- /dev/null
+++ b/c/wchar_helper_3.h
@@ -0,0 +1,155 @@
+/*
+ * wchar_t helpers, version CPython >= 3.3.
+ *
+ * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
+ * platforms, even ones with wchar_t limited to 2 bytes.  As such,
+ * this code here works from the outside like wchar_helper.h in the
+ * case Py_UNICODE_SIZE == 4, but the implementation is very different.
+ */
+
+typedef uint16_t cffi_char16_t;   /* 16-bit unit handled by the *Char16 helpers below */
+typedef uint32_t cffi_char32_t;   /* 32-bit unit handled by the *Char32 helpers below */
+
+
+/*
+ * Build a Python unicode object from the 'size' 32-bit items stored
+ * at 'w'.  The result of PyUnicode_FromKindAndData() is handed back
+ * to the caller unchanged.
+ */
+static PyObject *
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
+{
+    PyObject *text;
+
+    text = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
+    return text;
+}
+
+/*
+ * Build a Python unicode object from the 'size' 16-bit items stored
+ * at 'w'.
+ *
+ * A high surrogate (0xD800-0xDBFF) immediately followed by a low
+ * surrogate (0xDC00-0xDFFF) is merged into a single code point above
+ * 0xFFFF.  Any other item, including an unpaired surrogate, is copied
+ * through unchanged.
+ *
+ * Returns the new object, or NULL if the underlying CPython call
+ * (PyUnicode_FromKindAndData() or PyUnicode_New()) fails.
+ */
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{
+    /* are there any surrogate pairs, and if so, how many? */
+    Py_ssize_t i, count_surrogates = 0;
+    for (i = 0; i < size - 1; i++) {
+        if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
+                0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
+            count_surrogates++;
+    }
+    if (count_surrogates == 0) {
+        /* no, fast path */
+        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
+    }
+    else
+    {
+        /* every pair collapses into one item of the result, so the
+           result is 'count_surrogates' items shorter than the input */
+        PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
+        Py_UCS4 *data;
+        if (result == NULL)
+            return NULL;   /* propagate the failure; never touch 'result' */
+        assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
+        data = PyUnicode_4BYTE_DATA(result);
+
+        for (i = 0; i < size; i++)
+        {
+            cffi_char32_t ch = w[i];
+            if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
+                cffi_char32_t ch2 = w[i + 1];
+                if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+                    /* 10 bits from each half, offset by 0x10000 */
+                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+                    i++;   /* the low surrogate has been consumed too */
+                }
+            }
+            *data++ = ch;
+        }
+        return result;
+    }
+}
+
+/*
+ * Store the only character of 'unicode' into '*result' as a 16-bit
+ * value.
+ *
+ * Returns 0 on success.  Returns -1, after writing a short
+ * description of what was found into 'err_got', if the string does
+ * not have length 1 or if its character is above 0xFFFF.  No Python
+ * exception is set here.
+ */
+static int
+_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
+                             char *err_got)
+{
+    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
+    cffi_char32_t codepoint;
+
+    if (length != 1) {
+        sprintf(err_got, "unicode string of length %zd", length);
+        return -1;
+    }
+
+    codepoint = PyUnicode_READ_CHAR(unicode, 0);
+    if (codepoint > 0xFFFF) {
+        /* does not fit in a single 16-bit item */
+        sprintf(err_got, "larger-than-0xFFFF character");
+        return -1;
+    }
+
+    *result = (cffi_char16_t)codepoint;
+    return 0;
+}
+
+/*
+ * Store the only character of 'unicode' into '*result' as a 32-bit
+ * value.
+ *
+ * Returns 0 on success.  Returns -1, after writing a short
+ * description of what was found into 'err_got', if the string does
+ * not have length 1.  No Python exception is set here.
+ */
+static int
+_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
+                             char *err_got)
+{
+    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
+
+    if (length == 1) {
+        *result = PyUnicode_READ_CHAR(unicode, 0);
+        return 0;
+    }
+
+    sprintf(err_got, "unicode string of length %zd", length);
+    return -1;
+}
+
+/*
+ * Return the number of 16-bit items needed to hold 'unicode': one per
+ * character, plus one more for every character above 0xFFFF.  Only
+ * strings of 4-byte kind are scanned; for the other kinds the plain
+ * length is returned.
+ */
+static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
+{
+    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
+    Py_ssize_t extra = 0;
+    Py_UCS4 *cur, *end;
+
+    if (PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND)
+        return nchars;
+
+    cur = PyUnicode_4BYTE_DATA(unicode);
+    for (end = cur + nchars; cur < end; cur++) {
+        if (*cur > 0xFFFF)
+            extra++;
+    }
+    return nchars + extra;
+}
+
+/*
+ * Return the number of 32-bit items needed to hold 'unicode', which
+ * is simply its length.
+ */
+static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
+{
+    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
+    return nchars;
+}
+
+/*
+ * Copy the characters of 'unicode' into 'result' as 16-bit items.
+ * A character above 0xFFFF is written as two items, a 0xD800-based
+ * one followed by a 0xDC00-based one.
+ *
+ * Returns 0 on success.  If a character is above 0x10FFFF, sets
+ * ValueError and returns -1; items written before that point stay in
+ * 'result'.
+ *
+ * 'resultlen' is not consulted: the function writes as many items as
+ * it needs (see _my_PyUnicode_SizeAsChar16() for how many that is).
+ */
+static int _my_PyUnicode_AsChar16(PyObject *unicode,
+                                  cffi_char16_t *result,
+                                  Py_ssize_t resultlen)
+{
+    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
+    unsigned int kind = PyUnicode_KIND(unicode);
+    void *data = PyUnicode_DATA(unicode);
+    Py_ssize_t src, dst = 0;
+
+    for (src = 0; src < nchars; src++) {
+        cffi_char32_t ordinal = PyUnicode_READ(kind, data, src);
+
+        if (ordinal <= 0xFFFF) {
+            /* fits in one item */
+            result[dst++] = ordinal;
+            continue;
+        }
+        if (ordinal > 0x10FFFF) {
+            PyErr_Format(PyExc_ValueError,
+                         "unicode character out of range for "
+                         "conversion to char16_t: 0x%x", (int)ordinal);
+            return -1;
+        }
+
+        /* split into two 10-bit halves */
+        ordinal -= 0x10000;
+        result[dst++] = 0xD800 | (ordinal >> 10);
+        result[dst++] = 0xDC00 | (ordinal & 0x3FF);
+    }
+    return 0;
+}
+
+/*
+ * Copy the characters of 'unicode' into 'result', one 32-bit item per
+ * character.  Always returns 0.
+ *
+ * 'resultlen' is not consulted: exactly PyUnicode_GET_LENGTH(unicode)
+ * items are written.
+ */
+static int _my_PyUnicode_AsChar32(PyObject *unicode,
+                                  cffi_char32_t *result,
+                                  Py_ssize_t resultlen)
+{
+    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
+    unsigned int kind = PyUnicode_KIND(unicode);
+    void *data = PyUnicode_DATA(unicode);
+    Py_ssize_t pos = 0;
+
+    while (pos < nchars) {
+        result[pos] = PyUnicode_READ(kind, data, pos);
+        pos++;
+    }
+
+    return 0;
+}