[Python-checkins] bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)

Serhiy Storchaka webhook-mailer at python.org
Tue May 12 09:18:11 EDT 2020


https://github.com/python/cpython/commit/5650e76f63a6f4ec55d00ec13f143d84a2efee39
commit: 5650e76f63a6f4ec55d00ec13f143d84a2efee39
branch: master
author: Serhiy Storchaka <storchaka at gmail.com>
committer: GitHub <noreply at github.com>
date: 2020-05-12T16:18:00+03:00
summary:

bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)

files:
A Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst
M Lib/test/test_unicode.py
M Objects/unicodeobject.c

diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 2839889646789..2ee4e64d63530 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -720,6 +720,13 @@ def test_isidentifier(self):
         self.assertFalse("©".isidentifier())
         self.assertFalse("0".isidentifier())
 
+    @support.cpython_only
+    def test_isidentifier_legacy(self):
+        import _testcapi
+        u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
+        self.assertTrue(u.isidentifier())
+        self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
+
     def test_isprintable(self):
         self.assertTrue("".isprintable())
         self.assertTrue(" ".isprintable())
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst
new file mode 100644
index 0000000000000..1252db4dc9848
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst	
@@ -0,0 +1,2 @@
+Fixed :meth:`str.isidentifier` for non-canonicalized strings containing
+non-BMP characters on Windows.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 276547ca48a5b..826298c23a924 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self)
         return len && i == len;
     }
     else {
-        Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
         if (len == 0) {
             /* an empty string is not a valid identifier */
             return 0;
         }
 
         const wchar_t *wstr = _PyUnicode_WSTR(self);
-        Py_UCS4 ch = wstr[0];
+        Py_UCS4 ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
+        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+            && i < len
+            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+        {
+            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+            i++;
+        }
+#endif
         if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
             return 0;
         }
 
-        for (i = 1; i < len; i++) {
-            ch = wstr[i];
+        while (i < len) {
+            ch = wstr[i++];
+#if SIZEOF_WCHAR_T == 2
+            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+                && i < len
+                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
+            {
+                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
+                i++;
+            }
+#endif
             if (!_PyUnicode_IsXidContinue(ch)) {
                 return 0;
             }



More information about the Python-checkins mailing list