[Python-checkins] r63899 - in python/trunk: Lib/test/test_unicodedata.py Modules/unicodedata.c

walter.doerwald python-checkins at python.org
Mon Jun 2 22:36:39 CEST 2008


Author: walter.doerwald
Date: Mon Jun  2 22:36:03 2008
New Revision: 63899

Log:
Change all functions that expect one unicode character to accept a pair of
surrogates in narrow builds. Fixes issue #1706460.


Modified:
   python/trunk/Lib/test/test_unicodedata.py
   python/trunk/Modules/unicodedata.c

Modified: python/trunk/Lib/test/test_unicodedata.py
==============================================================================
--- python/trunk/Lib/test/test_unicodedata.py	(original)
+++ python/trunk/Lib/test/test_unicodedata.py	Mon Jun  2 22:36:03 2008
@@ -103,6 +103,7 @@
         self.assertEqual(self.db.digit(u'9'), 9)
         self.assertEqual(self.db.digit(u'\u215b', None), None)
         self.assertEqual(self.db.digit(u'\u2468'), 9)
+        self.assertEqual(self.db.digit(u'\U00020000', None), None)
 
         self.assertRaises(TypeError, self.db.digit)
         self.assertRaises(TypeError, self.db.digit, u'xx')
@@ -113,6 +114,7 @@
         self.assertEqual(self.db.numeric(u'9'), 9)
         self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
         self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
+        self.assertEqual(self.db.numeric(u'\U00020000', None), None)
 
         self.assertRaises(TypeError, self.db.numeric)
         self.assertRaises(TypeError, self.db.numeric, u'xx')
@@ -123,6 +125,7 @@
         self.assertEqual(self.db.decimal(u'9'), 9)
         self.assertEqual(self.db.decimal(u'\u215b', None), None)
         self.assertEqual(self.db.decimal(u'\u2468', None), None)
+        self.assertEqual(self.db.decimal(u'\U00020000', None), None)
 
         self.assertRaises(TypeError, self.db.decimal)
         self.assertRaises(TypeError, self.db.decimal, u'xx')
@@ -132,6 +135,7 @@
         self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
         self.assertEqual(self.db.category(u'a'), 'Ll')
         self.assertEqual(self.db.category(u'A'), 'Lu')
+        self.assertEqual(self.db.category(u'\U00020000'), 'Lo')
 
         self.assertRaises(TypeError, self.db.category)
         self.assertRaises(TypeError, self.db.category, u'xx')
@@ -140,6 +144,7 @@
         self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
         self.assertEqual(self.db.bidirectional(u' '), 'WS')
         self.assertEqual(self.db.bidirectional(u'A'), 'L')
+        self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')
 
         self.assertRaises(TypeError, self.db.bidirectional)
         self.assertRaises(TypeError, self.db.bidirectional, u'xx')
@@ -155,6 +160,7 @@
         self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
         self.assertEqual(self.db.mirrored(u'a'), 0)
         self.assertEqual(self.db.mirrored(u'\u2201'), 1)
+        self.assertEqual(self.db.mirrored(u'\U00020000'), 0)
 
         self.assertRaises(TypeError, self.db.mirrored)
         self.assertRaises(TypeError, self.db.mirrored, u'xx')
@@ -163,6 +169,7 @@
         self.assertEqual(self.db.combining(u'\uFFFE'), 0)
         self.assertEqual(self.db.combining(u'a'), 0)
         self.assertEqual(self.db.combining(u'\u20e1'), 230)
+        self.assertEqual(self.db.combining(u'\U00020000'), 0)
 
         self.assertRaises(TypeError, self.db.combining)
         self.assertRaises(TypeError, self.db.combining, u'xx')
@@ -185,6 +192,7 @@
         self.assertEqual(eaw(u'\uFF66'), 'H')
         self.assertEqual(eaw(u'\uFF1F'), 'F')
         self.assertEqual(eaw(u'\u2010'), 'A')
+        self.assertEqual(eaw(u'\U00020000'), 'W')
 
 class UnicodeMiscTest(UnicodeDatabaseTest):
 

Modified: python/trunk/Modules/unicodedata.c
==============================================================================
--- python/trunk/Modules/unicodedata.c	(original)
+++ python/trunk/Modules/unicodedata.c	Mon Jun  2 22:36:03 2008
@@ -54,12 +54,6 @@
     return &_PyUnicode_Database_Records[index];
 }
 
-static const _PyUnicode_DatabaseRecord*
-_getrecord(PyUnicodeObject* v)
-{
-    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
-}
-
 /* ------------- Previous-version API ------------------------------------- */
 typedef struct previous_version {
     PyObject_HEAD
@@ -92,6 +86,24 @@
 	return (PyObject*)self;
 }
 
+
+static Py_UCS4 getuchar(PyUnicodeObject *obj)
+{
+    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
+
+    if (PyUnicode_GET_SIZE(obj) == 1)
+	return *v;
+#ifndef Py_UNICODE_WIDE
+    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
+             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
+             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
+	return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
+#endif
+    PyErr_SetString(PyExc_TypeError,
+                    "need a single Unicode character as parameter");
+    return (Py_UCS4)-1;
+}
+
 /* --- Module API --------------------------------------------------------- */
 
 PyDoc_STRVAR(unicodedata_decimal__doc__,
@@ -108,17 +120,16 @@
     PyObject *defobj = NULL;
     int have_old = 0;
     long rc;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
         return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
         return NULL;
-    }
 
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0) {
             /* unassigned */
             have_old = 1;
@@ -131,7 +142,7 @@
     }
 
     if (!have_old)
-        rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
+        rc = Py_UNICODE_TODECIMAL(c);
     if (rc < 0) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError,
@@ -159,15 +170,14 @@
     PyUnicodeObject *v;
     PyObject *defobj = NULL;
     long rc;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
         return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
         return NULL;
-    }
-    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
+    rc = Py_UNICODE_TODIGIT(c);
     if (rc < 0) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError, "not a digit");
@@ -195,17 +205,16 @@
     PyObject *defobj = NULL;
     int have_old = 0;
     double rc;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
         return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
 
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0) {
             /* unassigned */
             have_old = 1;
@@ -218,7 +227,7 @@
     }
 
     if (!have_old)
-        rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
+        rc = Py_UNICODE_TONUMERIC(c);
     if (rc == -1.0) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
@@ -243,18 +252,17 @@
 {
     PyUnicodeObject *v;
     int index;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!:category",
 			  &PyUnicode_Type, &v))
 	return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
-    index = (int) _getrecord(v)->category;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+    index = (int) _getrecord_ex(c)->category;
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed != 0xFF)
             index = old->category_changed;
     }
@@ -273,18 +281,17 @@
 {
     PyUnicodeObject *v;
     int index;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 			  &PyUnicode_Type, &v))
 	return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
-    index = (int) _getrecord(v)->bidirectional;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+    index = (int) _getrecord_ex(c)->bidirectional;
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
         else if (old->bidir_changed != 0xFF)
@@ -305,18 +312,17 @@
 {
     PyUnicodeObject *v;
     int index;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!:combining",
 			  &PyUnicode_Type, &v))
 	return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
-    index = (int) _getrecord(v)->combining;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+    index = (int) _getrecord_ex(c)->combining;
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
     }
@@ -335,18 +341,17 @@
 {
     PyUnicodeObject *v;
     int index;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!:mirrored",
 			  &PyUnicode_Type, &v))
 	return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
-    index = (int) _getrecord(v)->mirrored;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+    index = (int) _getrecord_ex(c)->mirrored;
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
     }
@@ -364,18 +369,17 @@
 {
     PyUnicodeObject *v;
     int index;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
 			  &PyUnicode_Type, &v))
 	return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
-    index = (int) _getrecord(v)->east_asian_width;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+    index = (int) _getrecord_ex(c)->east_asian_width;
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
     }
@@ -396,20 +400,19 @@
     char decomp[256];
     int code, index, count, i;
     unsigned int prefix_index;
+    Py_UCS4 c;
 
     if (!PyArg_ParseTuple(args, "O!:decomposition",
 			  &PyUnicode_Type, &v))
 	return NULL;
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
 
-    code = (int) *PyUnicode_AS_UNICODE(v);
+    code = (int)c;
 
     if (self) {
-        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             return PyBytes_FromString(""); /* unassigned */
     }
@@ -1039,20 +1042,18 @@
 unicodedata_name(PyObject* self, PyObject* args)
 {
     char name[NAME_MAXLEN];
+    Py_UCS4 c;
 
     PyUnicodeObject* v;
     PyObject* defobj = NULL;
     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
         return NULL;
 
-    if (PyUnicode_GET_SIZE(v) != 1) {
-	PyErr_SetString(PyExc_TypeError,
-			"need a single Unicode character as parameter");
-	return NULL;
-    }
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
 
-    if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
-                    name, sizeof(name))) {
+    if (!_getucname(self, c, name, sizeof(name))) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError, "no such name");
             return NULL;


More information about the Python-checkins mailing list