[pypy-commit] cffi wchar_t: in-progress

Mon Jul 9 12:31:46 CEST 2012

Author: Armin Rigo <arigo at tunes.org>
Branch: wchar_t
Changeset: r603:c64975b8743a
Date: 2012-07-09 12:31 +0200
http://bitbucket.org/cffi/cffi/changeset/c64975b8743a/

Log:	in-progress

diff --git a/c/_cffi_backend.c b/c/_cffi_backend.c
--- a/c/_cffi_backend.c
+++ b/c/_cffi_backend.c
@@ -157,6 +157,10 @@
 # endif
 #endif
 
+#ifdef HAVE_WCHAR_H
+# include "wchar_helper.h"
+#endif
+
 /************************************************************/
 
 static CTypeDescrObject *
@@ -604,8 +608,10 @@
     else if (ct->ct_flags & CT_PRIMITIVE_CHAR) {
         if (ct->ct_size == sizeof(char))
             return PyString_FromStringAndSize(data, 1);
+#ifdef HAVE_WCHAR_H
         else
-            return PyUnicode_FromWideChar((wchar_t *)data, 1);
+            return _my_PyUnicode_FromWideChar((wchar_t *)data, 1);
+#endif
     }
 
     PyErr_Format(PyExc_SystemError,
@@ -677,10 +683,13 @@
     return -1;
 }
 
+#ifdef HAVE_WCHAR_H
 static wchar_t _convert_to_wchar_t(PyObject *init)
 {
-    if (PyUnicode_Check(init) && PyUnicode_GET_SIZE(init) == 1) {
-        return (wchar_t)(PyUnicode_AS_UNICODE(init)[0]);
+    if (PyUnicode_Check(init)) {
+        wchar_t ordinal;
+        if (_my_PyUnicode_AsSingleWideChar(init, &ordinal) == 0)
+            return ordinal;
     }
     if (CData_Check(init) &&
            (((CDataObject *)init)->c_type->ct_flags & CT_PRIMITIVE_CHAR) &&
@@ -692,6 +701,7 @@
                  "of length 1, not %.200s", Py_TYPE(init)->tp_name);
     return (wchar_t)-1;
 }
+#endif
 
 static int _convert_error(PyObject *init, const char *ct_name,
                           const char *expected)
@@ -855,12 +865,14 @@
                 return -1;
             data[0] = res;
         }
+#ifdef HAVE_WCHAR_H
         else {
             wchar_t res = _convert_to_wchar_t(init);
             if (res == (wchar_t)-1 && PyErr_Occurred())
                 return -1;
             *(wchar_t *)data = res;
         }
+#endif
         return 0;
     }
     if (ct->ct_flags & (CT_STRUCT|CT_UNION)) {
@@ -1092,11 +1104,13 @@
 
 static PyObject *cdata_str(CDataObject *cd)
 {
-    if (cd->c_type->ct_flags & CT_PRIMITIVE_CHAR) {
+    if (cd->c_type->ct_flags & CT_PRIMITIVE_CHAR &&
+        cd->c_type->ct_size == sizeof(char)) {
         return PyString_FromStringAndSize(cd->c_data, 1);
     }
     else if (cd->c_type->ct_itemdescr != NULL &&
-             cd->c_type->ct_itemdescr->ct_flags & CT_PRIMITIVE_CHAR) {
+             cd->c_type->ct_itemdescr->ct_flags & CT_PRIMITIVE_CHAR &&
+             cd->c_type->ct_itemdescr->ct_size == sizeof(char)) {
         Py_ssize_t length;
 
         if (cd->c_type->ct_flags & CT_ARRAY) {
@@ -1129,6 +1143,48 @@
         return cdata_repr(cd);
 }
 
+#ifdef HAVE_WCHAR_H
+static PyObject *cdata_unicode(CDataObject *cd)  /* backs CData.__unicode__: wide-char cdata -> unicode object, anything else -> cdata_repr() */
+{
+    if (cd->c_type->ct_flags & CT_PRIMITIVE_CHAR &&
+        cd->c_type->ct_size > sizeof(char)) {  /* a primitive char type wider than one byte */
+        return _my_PyUnicode_FromWideChar((wchar_t *)cd->c_data, 1);  /* convert the single wchar_t stored in c_data */
+    }
+    else if (cd->c_type->ct_itemdescr != NULL &&
+             cd->c_type->ct_itemdescr->ct_flags & CT_PRIMITIVE_CHAR &&
+             cd->c_type->ct_itemdescr->ct_size > sizeof(char)) {  /* item type is a wide char (array or pointer case) */
+        abort();  /* NOTE(review): unconditional abort, so the rest of this branch is unreachable; looks like a placeholder (commit log says "in-progress") */
+        Py_ssize_t length;
+
+        if (cd->c_type->ct_flags & CT_ARRAY) {
+            const char *start = cd->c_data;
+            const char *end;
+            length = get_array_length(cd);
+            end = (const char *)memchr(start, 0, length);  /* NOTE(review): searches for a zero byte, not a zero wchar_t; presumably needs a wide-char scan, confirm */
+            if (end != NULL)
+                length = end - start;
+        }
+        else {
+            if (cd->c_data == NULL) {  /* no data pointer: raise RuntimeError naming the cdata's repr */
+                PyObject *s = cdata_repr(cd);
+                if (s != NULL) {
+                    PyErr_Format(PyExc_RuntimeError,
+                                 "cannot use str() on %s",  /* NOTE(review): message mentions str() although this is the unicode path */
+                                 PyString_AS_STRING(s));
+                    Py_DECREF(s);
+                }
+                return NULL;
+            }
+            length = strlen(cd->c_data);  /* NOTE(review): strlen() counts bytes up to the first zero byte, not wchar_t items */
+        }
+
+        return PyString_FromStringAndSize(cd->c_data, length);  /* NOTE(review): builds a byte string, not a unicode object */
+    }
+    else
+        return cdata_repr(cd);  /* not a wide-char type: same result as repr */
+}
+#endif
+
 static PyObject *cdataowning_repr(CDataObject *cd)
 {
     Py_ssize_t size;
@@ -1670,6 +1726,11 @@
     (objobjargproc)cdata_ass_sub, /*mp_ass_subscript*/
 };
 
+static PyMethodDef CData_methods[] = {  /* method table placed in CData_Type's tp_methods slot */
+    {"__unicode__",     (PyCFunction)cdata_unicode,  METH_NOARGS},  /* NOTE(review): cdata_unicode is defined only under HAVE_WCHAR_H, but this entry is unconditional */
+    {NULL,              NULL}           /* sentinel */
+};
+
 static PyTypeObject CData_Type = {
     PyVarObject_HEAD_INIT(NULL, 0)
     "_cffi_backend.CData",
@@ -1697,6 +1758,8 @@
     cdata_richcompare,                          /* tp_richcompare */
     0,                                          /* tp_weaklistoffset */
     (getiterfunc)cdata_iter,                    /* tp_iter */
+    0,                                          /* tp_iternext */
+    CData_methods,                              /* tp_methods */
 };
 
 static PyTypeObject CDataOwning_Type = {
@@ -2307,12 +2370,18 @@
        EPTYPE(ul, unsigned long, CT_PRIMITIVE_UNSIGNED )        \
        EPTYPE(ull, unsigned long long, CT_PRIMITIVE_UNSIGNED )  \
        EPTYPE(f, float, CT_PRIMITIVE_FLOAT )                    \
-       EPTYPE(d, double, CT_PRIMITIVE_FLOAT )                   \
+       EPTYPE(d, double, CT_PRIMITIVE_FLOAT )
+#ifdef HAVE_WCHAR_H
+# define ENUM_PRIMITIVE_TYPES_WCHAR                             \
        EPTYPE(wc, wchar_t, CT_PRIMITIVE_CHAR )
+#else
+# define ENUM_PRIMITIVE_TYPES_WCHAR   /* nothing */
+#endif
 
 #define EPTYPE(code, typename, flags)                   \
     struct aligncheck_##code { char x; typename y; };
     ENUM_PRIMITIVE_TYPES
+    ENUM_PRIMITIVE_TYPES_WCHAR
 #undef EPTYPE
 
     CTypeDescrObject *td;
@@ -2326,7 +2395,9 @@
           flags                                         \
         },
     ENUM_PRIMITIVE_TYPES
+    ENUM_PRIMITIVE_TYPES_WCHAR
 #undef EPTYPE
+#undef ENUM_PRIMITIVE_TYPES_WCHAR
 #undef ENUM_PRIMITIVE_TYPES
         { NULL }
     };
diff --git a/c/test_c.py b/c/test_c.py
--- a/c/test_c.py
+++ b/c/test_c.py
@@ -1278,6 +1278,14 @@
     BWChar = new_primitive_type("wchar_t")
     pyuni4 = {1: True, 2: False}[len(u'\U00012345')]
     wchar4 = {2: False, 4: True}[sizeof(BWChar)]
+    assert str(cast(BWChar, 0x45)) == "<cdata 'wchar_t' u'E'>"
+    assert str(cast(BWChar, 0x1234)) == "<cdata 'wchar_t' u'\u1234'>"
+    if wchar4:
+        x = cast(BWChar, 0x12345)
+        assert str(x) == "<cdata 'wchar_t' u'\U00012345'>"
+        assert unicode(x) == u'\U00012345'
+    else:
+        assert not pyuni4
     #
     BWCharP = new_pointer_type(BWChar)
     BStruct = new_struct_type("foo_s")
diff --git a/c/wchar_helper.h b/c/wchar_helper.h
new file mode 100644
--- /dev/null
+++ b/c/wchar_helper.h
@@ -0,0 +1,82 @@
+/*
+ * wchar_t helpers
+ */
+
+#if (Py_UNICODE_SIZE == 2) && (SIZEOF_WCHAR_T == 4)
+# define CONVERT_WCHAR_TO_SURROGATES
+#endif
+
+
+#if PY_VERSION_HEX < 0x02070000 && defined(CONVERT_WCHAR_TO_SURROGATES)
+
+/* Before Python 2.7, PyUnicode_FromWideChar is not able to convert
+   wchar_t values greater than 65535 into two-unicode-characters surrogates.
+*/
+static PyObject *
+_my_PyUnicode_FromWideChar(register const wchar_t *w,  /* source buffer; NULL is rejected below */
+                           Py_ssize_t size)  /* number of wchar_t items read from w */
+{
+    PyObject *unicode;
+    register Py_ssize_t i;
+    Py_ssize_t alloc;  /* number of Py_UNICODE units the result needs */
+    const wchar_t *orig_w;  /* remembers the start of w so it can be walked twice */
+
+    if (w == NULL) {  /* report the bad argument and return NULL */
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+
+    alloc = size;
+    orig_w = w;
+    for (i = size; i > 0; i--) {  /* first pass: compute the output length */
+        if (*w > 0xFFFF)
+            alloc++;  /* a value above 0xFFFF is written as two units, so reserve one extra */
+        w++;
+    }
+    w = orig_w;  /* rewind for the copy pass */
+    unicode = PyUnicode_FromUnicode(NULL, alloc);  /* allocate the result object; its buffer is filled in below */
+    if (!unicode)
+        return NULL;
+
+    /* Copy the wchar_t data into the new object, splitting values above 0xFFFF into surrogate pairs */
+    {
+        register Py_UNICODE *u;
+        u = PyUnicode_AS_UNICODE(unicode);
+        for (i = size; i > 0; i--) {
+            if (*w > 0xFFFF) {
+                wchar_t ordinal = *w++;
+                ordinal -= 0x10000;  /* remove the 0x10000 bias before splitting the value */
+                *u++ = 0xD800 | (ordinal >> 10);  /* first unit: 0xD800 plus the bits above the low 10 */
+                *u++ = 0xDC00 | (ordinal & 0x3FF);  /* second unit: 0xDC00 plus the low 10 bits */
+            }
+            else
+                *u++ = *w++;  /* values up to 0xFFFF are copied as a single unit */
+        }
+    }
+    return unicode;
+}
+
+#else
+
+# define _my_PyUnicode_FromWideChar PyUnicode_FromWideChar
+
+#endif
+
+
+static int _my_PyUnicode_AsSingleWideChar(PyObject *unicode, wchar_t *result)  /* stores the one character held by unicode into *result; returns 0 on success, -1 otherwise; sets no Python exception itself */
+{
+    Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+    if (PyUnicode_GET_SIZE(unicode) == 1) {  /* exactly one unit: use it directly */
+        *result = (wchar_t)(u[0]);
+        return 0;
+    }
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+    if (PyUnicode_GET_SIZE(unicode) == 2 &&  /* two units are accepted only when they form a surrogate pair */
+            0xD800 <= u[0] && u[0] <= 0xDBFF &&
+            0xDC00 <= u[1] && u[1] <= 0xDFFF) {
+        *result = 0x10000 + ((u[0] - 0xD800) << 10) + (u[1] - 0xDC00);  /* recombine the pair into one value above 0xFFFF */
+        return 0;
+    }
+#endif
+    return -1;  /* any other length or content is rejected; *result is left untouched */
+}