[Python-checkins] closes bpo-32285: Add unicodedata.is_normalized. (GH-4806)

Benjamin Peterson webhook-mailer at python.org
Sun Nov 4 18:58:28 EST 2018


https://github.com/python/cpython/commit/2810dd7be9876236f74ac80716d113572c9098dd
commit: 2810dd7be9876236f74ac80716d113572c9098dd
branch: master
author: Max Bélanger <aeromax at gmail.com>
committer: Benjamin Peterson <benjamin at python.org>
date: 2018-11-04T15:58:24-08:00
summary:

closes bpo-32285: Add unicodedata.is_normalized. (GH-4806)

files:
A Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
M Doc/library/unicodedata.rst
M Doc/whatsnew/3.8.rst
M Lib/test/test_normalization.py
M Modules/clinic/unicodedata.c.h
M Modules/unicodedata.c

diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst
index 59548f3e8b4a..17e848bf552b 100644
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@@ -133,6 +133,13 @@ following functions:
    a human reader, if one has combining characters and the other
    doesn't, they may not compare equal.
 
+.. function:: is_normalized(form, unistr)
+
+   Return whether the Unicode string *unistr* is in the normal form *form*. Valid
+   values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
+
+   .. versionadded:: 3.8
+
 
 In addition, the module exposes the following constant:
 
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index 5397206030fe..566c369c85bd 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -204,6 +204,13 @@ Added method :meth:`~tkinter.Canvas.moveto`
 in the :class:`tkinter.Canvas` class.
 (Contributed by Juliette Monsel in :issue:`23831`.)
 
+unicodedata
+-----------
+
+* New function :func:`~unicodedata.is_normalized` can be used to verify a string
+  is in a specific normal form. (Contributed by Max Bélanger and David Euresti in
+  :issue:`32285`.)
+
 venv
 ----
 
diff --git a/Lib/test/test_normalization.py b/Lib/test/test_normalization.py
index 304245644502..ba877e73f7d9 100644
--- a/Lib/test/test_normalization.py
+++ b/Lib/test/test_normalization.py
@@ -3,7 +3,7 @@
 
 from http.client import HTTPException
 import sys
-from unicodedata import normalize, unidata_version
+from unicodedata import normalize, is_normalized, unidata_version
 
 TESTDATAFILE = "NormalizationTest.txt"
 TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE
@@ -88,6 +88,15 @@ def run_normalization_tests(self, testdata):
                             NFKD(c3) == NFKD(c4) == NFKD(c5),
                             line)
 
+            self.assertTrue(is_normalized("NFC", c2))
+            self.assertTrue(is_normalized("NFC", c4))
+
+            self.assertTrue(is_normalized("NFD", c3))
+            self.assertTrue(is_normalized("NFD", c5))
+
+            self.assertTrue(is_normalized("NFKC", c4))
+            self.assertTrue(is_normalized("NFKD", c5))
+
             # Record part 1 data
             if part == "@Part1":
                 part1_data[c1] = 1
diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst b/Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
new file mode 100644
index 000000000000..87f84b02eb84
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst	
@@ -0,0 +1,2 @@
+New function unicodedata.is_normalized, which can check whether a string is
+in a specific normal form.
diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h
index 72e3f6545778..54021fedba41 100644
--- a/Modules/clinic/unicodedata.c.h
+++ b/Modules/clinic/unicodedata.c.h
@@ -284,6 +284,38 @@ unicodedata_UCD_decomposition(PyObject *self, PyObject *arg)
     return return_value;
 }
 
+PyDoc_STRVAR(unicodedata_UCD_is_normalized__doc__,
+"is_normalized($self, form, unistr, /)\n"
+"--\n"
+"\n"
+"Return whether the Unicode string unistr is in the normal form \'form\'.\n"
+"\n"
+"Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'.");
+
+#define UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF    \
+    {"is_normalized", (PyCFunction)unicodedata_UCD_is_normalized, METH_FASTCALL, unicodedata_UCD_is_normalized__doc__},
+
+static PyObject *
+unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
+                                   PyObject *input);
+
+static PyObject *
+unicodedata_UCD_is_normalized(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
+{
+    PyObject *return_value = NULL;
+    PyObject *form;
+    PyObject *input;
+
+    if (!_PyArg_ParseStack(args, nargs, "UU:is_normalized",
+        &form, &input)) {
+        goto exit;
+    }
+    return_value = unicodedata_UCD_is_normalized_impl(self, form, input);
+
+exit:
+    return return_value;
+}
+
 PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
 "normalize($self, form, unistr, /)\n"
 "--\n"
@@ -296,17 +328,17 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
     {"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__},
 
 static PyObject *
-unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
+unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
                                PyObject *input);
 
 static PyObject *
 unicodedata_UCD_normalize(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
 {
     PyObject *return_value = NULL;
-    const char *form;
+    PyObject *form;
     PyObject *input;
 
-    if (!_PyArg_ParseStack(args, nargs, "sU:normalize",
+    if (!_PyArg_ParseStack(args, nargs, "UU:normalize",
         &form, &input)) {
         goto exit;
     }
@@ -379,4 +411,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=dc899bff0ecd14c1 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2c5fbf597c18f6b8 input=a9049054013a1b77]*/
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index e8788f5036dd..9ceab1b3db4f 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -19,6 +19,11 @@
 #include "ucnhash.h"
 #include "structmember.h"
 
+_Py_IDENTIFIER(NFC);
+_Py_IDENTIFIER(NFD);
+_Py_IDENTIFIER(NFKC);
+_Py_IDENTIFIER(NFKD);
+
 /*[clinic input]
 module unicodedata
 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
@@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
     return result;
 }
 
-/* Return 1 if the input is certainly normalized, 0 if it might not be. */
-static int
+typedef enum {YES, NO, MAYBE} NormalMode;
+
+/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
+static NormalMode
 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
 {
     Py_ssize_t i, len;
@@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
     /* An older version of the database is requested, quickchecks must be
        disabled. */
     if (self && UCD_Check(self))
-        return 0;
+        return NO;
 
     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
        as described in http://unicode.org/reports/tr15/#Annex8. */
@@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
         unsigned char quickcheck = record->normalization_quick_check;
 
         if (quickcheck & quickcheck_mask)
-            return 0; /* this string might need normalization */
+            return MAYBE; /* this string might need normalization */
         if (combining && prev_combining > combining)
-            return 0; /* non-canonical sort order, not normalized */
+            return NO; /* non-canonical sort order, not normalized */
         prev_combining = combining;
     }
-    return 1; /* certainly normalized */
+    return YES; /* certainly normalized */
+}
+
+/*[clinic input]
+unicodedata.UCD.is_normalized
+
+    self: self
+    form: unicode
+    unistr as input: unicode
+    /
+
+Return whether the Unicode string unistr is in the normal form 'form'.
+
+Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
+                                   PyObject *input)
+/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
+{
+    if (PyUnicode_READY(input) == -1) {
+        return NULL;
+    }
+
+    if (PyUnicode_GET_LENGTH(input) == 0) {
+        /* special case empty input strings. */
+        Py_RETURN_TRUE;
+    }
+
+    PyObject *result;
+    int nfc = 0;
+    int k = 0;
+    NormalMode m;
+
+    PyObject *cmp;
+    int match = 0;
+
+    if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
+        nfc = 1;
+    }
+    else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
+        nfc = 1;
+        k = 1;
+    }
+    else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
+        /* matches default values for `nfc` and `k` */
+    }
+    else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
+        k = 1;
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError, "invalid normalization form");
+        return NULL;
+    }
+
+    m = is_normalized(self, input, nfc, k);
+
+    if (m == MAYBE) {
+        cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
+        if (cmp == NULL) {
+            return NULL;
+        }
+        match = PyUnicode_Compare(input, cmp);
+        Py_DECREF(cmp);
+        result = (match == 0) ? Py_True : Py_False;
+    }
+    else {
+        result = (m == YES) ? Py_True : Py_False;
+    }
+
+    Py_INCREF(result);
+    return result;
 }
 
+
 /*[clinic input]
 unicodedata.UCD.normalize
 
     self: self
-    form: str
+    form: unicode
     unistr as input: unicode
     /
 
@@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
 [clinic start generated code]*/
 
 static PyObject *
-unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
+unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
                                PyObject *input)
-/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
+/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
 {
     if (PyUnicode_GET_LENGTH(input) == 0) {
         /* Special case empty input strings, since resizing
@@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
         return input;
     }
 
-    if (strcmp(form, "NFC") == 0) {
-        if (is_normalized(self, input, 1, 0)) {
+    if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
+        if (is_normalized(self, input, 1, 0) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 0);
     }
-    if (strcmp(form, "NFKC") == 0) {
-        if (is_normalized(self, input, 1, 1)) {
+    if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
+        if (is_normalized(self, input, 1, 1) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 1);
     }
-    if (strcmp(form, "NFD") == 0) {
-        if (is_normalized(self, input, 0, 0)) {
+    if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
+        if (is_normalized(self, input, 0, 0) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfd_nfkd(self, input, 0);
     }
-    if (strcmp(form, "NFKD") == 0) {
-        if (is_normalized(self, input, 0, 1)) {
+    if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
+        if (is_normalized(self, input, 0, 1) == YES) {
             Py_INCREF(input);
             return input;
         }
@@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
     UNICODEDATA_UCD_NAME_METHODDEF
     UNICODEDATA_UCD_LOOKUP_METHODDEF
+    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
     UNICODEDATA_UCD_NORMALIZE_METHODDEF
     {NULL, NULL}                /* sentinel */
 };



More information about the Python-checkins mailing list