[Python-checkins] r70454 - in python/trunk: Misc/NEWS Modules/_testcapimodule.c Objects/unicodeobject.c

Wed Mar 18 17:07:26 CET 2009

Author: mark.dickinson
Date: Wed Mar 18 17:07:26 2009
New Revision: 70454

Log:
Issue 4474: On platforms with sizeof(wchar_t) == 4 and
sizeof(Py_UNICODE) == 2, PyUnicode_FromWideChar now converts
each character outside the BMP to the appropriate surrogate pair.

Thanks to Victor Stinner for the patch.

(backport of r70452 from py3k to trunk)



Modified:
   python/trunk/Misc/NEWS
   python/trunk/Modules/_testcapimodule.c
   python/trunk/Objects/unicodeobject.c

Modified: python/trunk/Misc/NEWS
==============================================================================

--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Wed Mar 18 17:07:26 2009
@@ -12,6 +12,10 @@
 Core and Builtins
 -----------------
 
+- Issue #4474: PyUnicode_FromWideChar now converts characters outside
+  the BMP to surrogate pairs, on systems with sizeof(wchar_t) == 4
+  and sizeof(Py_UNICODE) == 2.
+
 - Issue #5237: Allow auto-numbered fields in str.format(). For
   example: '{} {}'.format(1, 2) == '1 2'.
 

Modified: python/trunk/Modules/_testcapimodule.c
==============================================================================
--- python/trunk/Modules/_testcapimodule.c	(original)
+++ python/trunk/Modules/_testcapimodule.c	Wed Mar 18 17:07:26 2009
@@ -621,6 +621,48 @@
 }
 
 static PyObject *
+test_widechar(PyObject *self)
+{
+#if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+	const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
+	size_t wtextlen = 1;
+#else
+	const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
+	size_t wtextlen = 2;
+#endif
+	PyObject *wide, *utf8;
+
+	wide = PyUnicode_FromWideChar(wtext, wtextlen);
+	if (wide == NULL)
+		return NULL;
+
+	utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
+	if (utf8 == NULL) {
+		Py_DECREF(wide);
+		return NULL;
+	}
+
+	if (PyUnicode_GET_SIZE(wide) != PyUnicode_GET_SIZE(utf8)) {
+		Py_DECREF(wide);
+		Py_DECREF(utf8);
+		return raiseTestError("test_widechar",
+				"wide string and utf8 string have different length");
+	}
+	if (PyUnicode_Compare(wide, utf8)) {
+		Py_DECREF(wide);
+		Py_DECREF(utf8);
+		if (PyErr_Occurred())
+			return NULL;
+		return raiseTestError("test_widechar",
+				"wide string and utf8 string are differents");
+	}
+
+	Py_DECREF(wide);
+	Py_DECREF(utf8);
+	Py_RETURN_NONE;
+}
+
+static PyObject *
 test_empty_argparse(PyObject *self)
 {
 	/* Test that formats can begin with '|'. See issue #4720. */
@@ -975,6 +1017,7 @@
 #endif
 #ifdef Py_USING_UNICODE
 	{"test_u_code",		(PyCFunction)test_u_code,	 METH_NOARGS},
+	{"test_widechar",	(PyCFunction)test_widechar,	 METH_NOARGS},
 #endif
 #ifdef WITH_THREAD
 	{"_test_thread_state",  test_thread_state, 		 METH_VARARGS},

Modified: python/trunk/Objects/unicodeobject.c
==============================================================================
--- python/trunk/Objects/unicodeobject.c	(original)
+++ python/trunk/Objects/unicodeobject.c	Wed Mar 18 17:07:26 2009
@@ -529,6 +529,60 @@
 
 #ifdef HAVE_WCHAR_H
 
+#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+# define CONVERT_WCHAR_TO_SURROGATES
+#endif
+
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+
+/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
+   to convert from UTF-32 to UTF-16. */
+
+PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
+                                 Py_ssize_t size)
+{
+    PyUnicodeObject *unicode;
+    register Py_ssize_t i;
+    Py_ssize_t alloc;
+    const wchar_t *orig_w;
+
+    if (w == NULL) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+
+    alloc = size;
+    orig_w = w;
+    for (i = size; i > 0; i--) {
+        if (*w > 0xFFFF)
+            alloc++;
+        w++;
+    }
+    w = orig_w;
+    unicode = _PyUnicode_New(alloc);
+    if (!unicode)
+        return NULL;
+
+    /* Copy the wchar_t data into the new object */
+    {
+        register Py_UNICODE *u;
+        u = PyUnicode_AS_UNICODE(unicode);
+        for (i = size; i > 0; i--) {
+            if (*w > 0xFFFF) {
+                wchar_t ordinal = *w++;
+                ordinal -= 0x10000;
+                *u++ = 0xD800 | (ordinal >> 10);
+                *u++ = 0xDC00 | (ordinal & 0x3FF);
+            }
+            else
+                *u++ = *w++;
+        }
+    }
+    return (PyObject *)unicode;
+}
+
+#else
+
 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                  Py_ssize_t size)
 {
@@ -559,6 +613,10 @@
     return (PyObject *)unicode;
 }
 
+#endif /* CONVERT_WCHAR_TO_SURROGATES */
+
+#undef CONVERT_WCHAR_TO_SURROGATES
+
 static void
 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 {