[pypy-commit] cffi default: Write a copy of wchar_helper.h that uses CPython 3.3's new
arigo
pypy.commits at gmail.com
Mon Jun 19 06:09:59 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r2985:5e554e5c0dfc
Date: 2017-06-19 12:06 +0200
http://bitbucket.org/cffi/cffi/changeset/5e554e5c0dfc/
Log: Write a copy of wchar_helper.h that uses CPython 3.3's new unicode
string API. It makes sense anyway for speed reasons, but it also
avoids even more special cases for platforms with 16-bit wchar_t on
CPython >= 3.3.
diff --git a/c/_cffi_backend.c b/c/_cffi_backend.c
--- a/c/_cffi_backend.c
+++ b/c/_cffi_backend.c
@@ -286,7 +286,11 @@
# include "file_emulator.h"
#endif
-#include "wchar_helper.h"
+#ifdef PyUnicode_KIND /* Python >= 3.3 */
+# include "wchar_helper_3.h"
+#else
+# include "wchar_helper.h"
+#endif
#include "../cffi/_cffi_errors.h"
diff --git a/c/wchar_helper_3.h b/c/wchar_helper_3.h
new file mode 100644
--- /dev/null
+++ b/c/wchar_helper_3.h
@@ -0,0 +1,155 @@
+/*
+ * wchar_t helpers, version CPython >= 3.3.
+ *
+ * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
+ * platforms, even ones with wchar_t limited to 2 bytes. As such,
+ * this code here works from the outside like wchar_helper.h in the
+ * case Py_UNICODE_SIZE == 4, but the implementation is very different.
+ */
+
+typedef uint16_t cffi_char16_t;
+typedef uint32_t cffi_char32_t;
+
+
+static PyObject *
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
+{
+ return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
+}
+
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{
+ /* are there any surrogate pairs, and if so, how many? */
+ Py_ssize_t i, count_surrogates = 0;
+ for (i = 0; i < size - 1; i++) {
+ if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
+ 0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
+ count_surrogates++;
+ }
+ if (count_surrogates == 0) {
+ /* no, fast path */
+ return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
+ }
+ else
+ {
+ PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
+ Py_UCS4 *data;
+ assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
+ data = PyUnicode_4BYTE_DATA(result);
+
+ for (i = 0; i < size; i++)
+ {
+ cffi_char32_t ch = w[i];
+ if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
+ cffi_char32_t ch2 = w[i + 1];
+ if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ i++;
+ }
+ }
+ *data++ = ch;
+ }
+ return result;
+ }
+}
+
+static int
+_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
+ char *err_got)
+{
+ cffi_char32_t ch;
+ if (PyUnicode_GET_LENGTH(unicode) != 1) {
+ sprintf(err_got, "unicode string of length %zd",
+ PyUnicode_GET_LENGTH(unicode));
+ return -1;
+ }
+ ch = PyUnicode_READ_CHAR(unicode, 0);
+
+ if (ch > 0xFFFF)
+ {
+ sprintf(err_got, "larger-than-0xFFFF character");
+ return -1;
+ }
+ *result = (cffi_char16_t)ch;
+ return 0;
+}
+
+static int
+_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
+ char *err_got)
+{
+ if (PyUnicode_GET_LENGTH(unicode) != 1) {
+ sprintf(err_got, "unicode string of length %zd",
+ PyUnicode_GET_LENGTH(unicode));
+ return -1;
+ }
+ *result = PyUnicode_READ_CHAR(unicode, 0);
+ return 0;
+}
+
+static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
+{
+ Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
+ Py_ssize_t result = length;
+ unsigned int kind = PyUnicode_KIND(unicode);
+
+ if (kind == PyUnicode_4BYTE_KIND)
+ {
+ Py_UCS4 *data = PyUnicode_4BYTE_DATA(unicode);
+ Py_ssize_t i;
+ for (i = 0; i < length; i++) {
+ if (data[i] > 0xFFFF)
+ result++;
+ }
+ }
+ return result;
+}
+
+static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
+{
+ return PyUnicode_GET_LENGTH(unicode);
+}
+
+static int _my_PyUnicode_AsChar16(PyObject *unicode,
+ cffi_char16_t *result,
+ Py_ssize_t resultlen)
+{
+ Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+ unsigned int kind = PyUnicode_KIND(unicode);
+ void *data = PyUnicode_DATA(unicode);
+ Py_ssize_t i;
+
+ for (i = 0; i < len; i++) {
+ cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
+ if (ordinal > 0xFFFF) {
+ if (ordinal > 0x10FFFF) {
+ PyErr_Format(PyExc_ValueError,
+ "unicode character out of range for "
+ "conversion to char16_t: 0x%x", (int)ordinal);
+ return -1;
+ }
+ ordinal -= 0x10000;
+ *result++ = 0xD800 | (ordinal >> 10);
+ *result++ = 0xDC00 | (ordinal & 0x3FF);
+ }
+ else
+ *result++ = ordinal;
+ }
+ return 0;
+}
+
+static int _my_PyUnicode_AsChar32(PyObject *unicode,
+ cffi_char32_t *result,
+ Py_ssize_t resultlen)
+{
+ Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+ unsigned int kind = PyUnicode_KIND(unicode);
+ void *data = PyUnicode_DATA(unicode);
+ Py_ssize_t i;
+
+ for (i = 0; i < len; i++)
+ result[i] = PyUnicode_READ(kind, data, i);
+
+ return 0;
+}
More information about the pypy-commit
mailing list