[pypy-commit] cffi default: When sizeof(wchar_t) == 4 but we are using 2-bytes unicode characters in

arigo noreply at buildbot.pypy.org
Mon Jul 9 20:15:45 CEST 2012


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r615:d0cc2c334761
Date: 2012-07-09 20:15 +0200
http://bitbucket.org/cffi/cffi/changeset/d0cc2c334761/

Log:	When sizeof(wchar_t) == 4 but we are using 2-byte unicode
	characters in Python, even the 2.7 version of
	PyUnicode_FromWideChar() fails to detect values that are too large
	to be encoded as surrogates, and returns nonsense. In a "better
	safe than sorry" effort, raise ValueError in this case.

diff --git a/c/test_c.py b/c/test_c.py
--- a/c/test_c.py
+++ b/c/test_c.py
@@ -1393,6 +1393,13 @@
     f = callback(BFunc, cb, -42)
     #assert f(u'a\u1234b') == 3    -- not implemented
     py.test.raises(NotImplementedError, f, u'a\u1234b')
+    #
+    if wchar4:
+        # try out-of-range wchar_t values
+        x = cast(BWChar, 1114112)
+        py.test.raises(ValueError, unicode, x)
+        x = cast(BWChar, -1)
+        py.test.raises(ValueError, unicode, x)
 
 def test_keepalive_struct():
     # exception to the no-keepalive rule: p=newp(BStructPtr) returns a
diff --git a/c/wchar_helper.h b/c/wchar_helper.h
--- a/c/wchar_helper.h
+++ b/c/wchar_helper.h
@@ -7,10 +7,12 @@
 #endif
 
 
-#if PY_VERSION_HEX < 0x02070000 && defined(CONVERT_WCHAR_TO_SURROGATES)
+#ifdef CONVERT_WCHAR_TO_SURROGATES
 
 /* Before Python 2.7, PyUnicode_FromWideChar is not able to convert
    wchar_t values greater than 65535 into two-unicode-characters surrogates.
+   But even the Python 2.7 version doesn't detect wchar_t values that are
+   out of range(1114112), and just returns nonsense.
 */
 static PyObject *
 _my_PyUnicode_FromWideChar(register const wchar_t *w,
@@ -43,8 +45,16 @@
         register Py_UNICODE *u;
         u = PyUnicode_AS_UNICODE(unicode);
         for (i = size; i > 0; i--) {
-            if (*w > 0xFFFF) {
-                wchar_t ordinal = *w++;
+            if (((unsigned int)*w) > 0xFFFF) {
+                wchar_t ordinal;
+                if (((unsigned int)*w) > 0x10FFFF) {
+                    PyErr_Format(PyExc_ValueError,
+                                 "wchar_t out of range for "
+                                 "convertion to unicode: 0x%x", (int)*w);
+                    Py_DECREF(unicode);
+                    return NULL;
+                }
+                ordinal = *w++;
                 ordinal -= 0x10000;
                 *u++ = 0xD800 | (ordinal >> 10);
                 *u++ = 0xDC00 | (ordinal & 0x3FF);


More information about the pypy-commit mailing list