[Python-checkins] cpython: Issue #19676: Added the "namereplace" error handler.

Tue Nov 25 13:14:28 CET 2014

https://hg.python.org/cpython/rev/32d08aacffe0
changeset:   93581:32d08aacffe0
parent:      93577:c2b36196b7f5
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Tue Nov 25 13:57:17 2014 +0200
summary:
  Issue #19676: Added the "namereplace" error handler.

files:
  Doc/c-api/codec.rst             |    5 +
  Doc/howto/unicode.rst           |    7 +-
  Doc/library/codecs.rst          |   17 +++
  Doc/library/functions.rst       |    3 +
  Doc/library/io.rst              |    7 +-
  Include/codecs.h                |    3 +
  Lib/codecs.py                   |    3 +
  Lib/test/test_codeccallbacks.py |  100 ++++++++++++++++++-
  Lib/test/test_codecs.py         |    7 +
  Misc/NEWS                       |    2 +
  Python/codecs.c                 |  108 ++++++++++++++++++++
  11 files changed, 255 insertions(+), 7 deletions(-)

diff --git a/Doc/c-api/codec.rst b/Doc/c-api/codec.rst
--- a/Doc/c-api/codec.rst
+++ b/Doc/c-api/codec.rst
@@ -116,3 +116,8 @@
    Replace the unicode encode error with backslash escapes (``\x``, ``\u`` and
    ``\U``).
 
+.. c:function:: PyObject* PyCodec_NameReplaceErrors(PyObject *exc)
+
+   Replace the unicode encode error with `\N{...}` escapes.
+
+  .. versionadded: 3.4
diff --git a/Doc/howto/unicode.rst b/Doc/howto/unicode.rst
--- a/Doc/howto/unicode.rst
+++ b/Doc/howto/unicode.rst
@@ -325,8 +325,9 @@
 :meth:`~bytes.decode` method but supports a few more possible handlers. As well as
 ``'strict'``, ``'ignore'``, and ``'replace'`` (which in this case
 inserts a question mark instead of the unencodable character), there is
-also ``'xmlcharrefreplace'`` (inserts an XML character reference) and
-``backslashreplace`` (inserts a ``\uNNNN`` escape sequence).
+also ``'xmlcharrefreplace'`` (inserts an XML character reference),
+``backslashreplace`` (inserts a ``\uNNNN`` escape sequence) and
+``namereplace`` (inserts a ``\N{...}`` escape sequence).
 
 The following example shows the different results::
 
@@ -346,6 +347,8 @@
     b'ꀀabcd޴'
     >>> u.encode('ascii', 'backslashreplace')
     b'\\ua000abcd\\u07b4'
+    >>> u.encode('ascii', 'namereplace')
+    b'\\N{YI SYLLABLE IT}abcd\\u07b4'
 
 The low-level routines for registering and accessing the available
 encodings are found in the :mod:`codecs` module.  Implementing new
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -98,6 +98,8 @@
      reference (for encoding only)
    * ``'backslashreplace'``: replace with backslashed escape sequences (for
      encoding only)
+   * ``'namereplace'``: replace with ``\N{...}`` escape sequences (for
+     encoding only)
    * ``'surrogateescape'``: on decoding, replace with code points in the Unicode
      Private Use Area ranging from U+DC80 to U+DCFF.  These private code
      points will then be turned back into the same bytes when the
@@ -232,6 +234,11 @@
    Implements the ``backslashreplace`` error handling (for encoding only): the
    unencodable character is replaced by a backslashed escape sequence.
 
+.. function:: namereplace_errors(exception)
+
+   Implements the ``namereplace`` error handling (for encoding only): the
+   unencodable character is replaced by a ``\N{...}`` escape sequence.
+
 To simplify working with encoded files or stream, the module also defines these
 utility functions:
 
@@ -363,6 +370,9 @@
 | ``'backslashreplace'``  | Replace with backslashed escape sequences     |
 |                         | (only for encoding).                          |
 +-------------------------+-----------------------------------------------+
+| ``'namereplace'``       | Replace with ``\N{...}`` escape sequences     |
+|                         | (only for encoding).                          |
++-------------------------+-----------------------------------------------+
 | ``'surrogateescape'``   | Replace byte with surrogate U+DCxx, as defined|
 |                         | in :pep:`383`.                                |
 +-------------------------+-----------------------------------------------+
@@ -384,6 +394,9 @@
 .. versionchanged:: 3.4
    The ``'surrogatepass'`` error handlers now works with utf-16\* and utf-32\* codecs.
 
+.. versionadded:: 3.4
+   The ``'namereplace'`` error handler.
+
 The set of allowed values can be extended via :meth:`register_error`.
 
 
@@ -477,6 +490,8 @@
 
    * ``'backslashreplace'`` Replace with backslashed escape sequences.
 
+   * ``'namereplace'`` Replace with ``\N{...}`` escape sequences.
+
    The *errors* argument will be assigned to an attribute of the same name.
    Assigning to this attribute makes it possible to switch between different error
    handling strategies during the lifetime of the :class:`IncrementalEncoder`
@@ -625,6 +640,8 @@
 
    * ``'backslashreplace'`` Replace with backslashed escape sequences.
 
+   * ``'namereplace'`` Replace with ``\N{...}`` escape sequences.
+
    The *errors* argument will be assigned to an attribute of the same name.
    Assigning to this attribute makes it possible to switch between different error
    handling strategies during the lifetime of the :class:`StreamWriter` object.
diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst
--- a/Doc/library/functions.rst
+++ b/Doc/library/functions.rst
@@ -975,6 +975,9 @@
      replaces unsupported characters with Python's backslashed escape
      sequences.
 
+   * ``'namereplace'`` (also only supported when writing)
+     replaces unsupported characters with ``\N{...}`` escape sequences.
+
    .. index::
       single: universal newlines; open() built-in function
 
diff --git a/Doc/library/io.rst b/Doc/library/io.rst
--- a/Doc/library/io.rst
+++ b/Doc/library/io.rst
@@ -827,9 +827,10 @@
    errors can lead to data loss.)  ``'replace'`` causes a replacement marker
    (such as ``'?'``) to be inserted where there is malformed data.  When
    writing, ``'xmlcharrefreplace'`` (replace with the appropriate XML character
-   reference) or ``'backslashreplace'`` (replace with backslashed escape
-   sequences) can be used.  Any other error handling name that has been
-   registered with :func:`codecs.register_error` is also valid.
+   reference), ``'backslashreplace'`` (replace with backslashed escape
+   sequences) or ``'namereplace'`` (replace with ``\N{...}`` escape sequences)
+   can be used.  Any other error handling name that has been registered with
+   :func:`codecs.register_error` is also valid.
 
    .. index::
       single: universal newlines; io.TextIOWrapper class
diff --git a/Include/codecs.h b/Include/codecs.h
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -225,6 +225,9 @@
 /* replace the unicode encode error with backslash escapes (\x, \u and \U) */
 PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc);
 
+/* replace the unicode encode error with backslash escapes (\N, \x, \u and \U) */
+PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc);
+
 PyAPI_DATA(const char *) Py_hexdigits;
 
 #ifdef __cplusplus
diff --git a/Lib/codecs.py b/Lib/codecs.py
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -22,6 +22,7 @@
            "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
            "strict_errors", "ignore_errors", "replace_errors",
            "xmlcharrefreplace_errors",
+           "backslashreplace_errors", "namereplace_errors",
            "register_error", "lookup_error"]
 
 ### Constants
@@ -1085,6 +1086,7 @@
     replace_errors = lookup_error("replace")
     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
     backslashreplace_errors = lookup_error("backslashreplace")
+    namereplace_errors = lookup_error("namereplace")
 except LookupError:
     # In --disable-unicode builds, these error handler are missing
     strict_errors = None
@@ -1092,6 +1094,7 @@
     replace_errors = None
     xmlcharrefreplace_errors = None
     backslashreplace_errors = None
+    namereplace_errors = None
 
 # Tell modulefinder that using codecs probably needs the encodings
 # package
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -158,6 +158,22 @@
         sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
         self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
 
+    def test_nameescape(self):
+        # Does the same as backslashescape, but prefers ``\N{...}`` escape
+        # sequences.
+        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
+        sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
+                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
+        self.assertEqual(sin.encode("ascii", "namereplace"), sout)
+
+        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
+                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
+        self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
+
+        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
+                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
+        self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
+
     def test_decoding_callbacks(self):
         # This is a test for a decoding callback handler
         # that allows the decoding of the invalid sequence
@@ -297,7 +313,7 @@
     def test_longstrings(self):
         # test long strings to check for memory overflow problems
         errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
-                   "backslashreplace"]
+                   "backslashreplace", "namereplace"]
         # register the handlers under different names,
         # to prevent the codec from recognizing the name
         for err in errors:
@@ -611,6 +627,81 @@
                 ("\\udfff", 1)
             )
 
+    def test_badandgoodnamereplaceexceptions(self):
+        # "namereplace" complains about a non-exception passed in
+        self.assertRaises(
+           TypeError,
+           codecs.namereplace_errors,
+           42
+        )
+        # "namereplace" complains about the wrong exception types
+        self.assertRaises(
+           TypeError,
+           codecs.namereplace_errors,
+           UnicodeError("ouch")
+        )
+        # "namereplace" can only be used for encoding
+        self.assertRaises(
+            TypeError,
+            codecs.namereplace_errors,
+            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
+        )
+        self.assertRaises(
+            TypeError,
+            codecs.namereplace_errors,
+            UnicodeTranslateError("\u3042", 0, 1, "ouch")
+        )
+        # Use the correct exception
+        self.assertEqual(
+            codecs.namereplace_errors(
+                UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
+            ("\\N{HIRAGANA LETTER A}", 1)
+        )
+        self.assertEqual(
+            codecs.namereplace_errors(
+                UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")),
+            ("\\x00", 1)
+        )
+        self.assertEqual(
+            codecs.namereplace_errors(
+                UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")),
+            ("\\N{LATIN SMALL LETTER Y WITH DIAERESIS}", 1)
+        )
+        self.assertEqual(
+            codecs.namereplace_errors(
+                UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")),
+            ("\\N{LATIN CAPITAL LETTER A WITH MACRON}", 1)
+        )
+        self.assertEqual(
+            codecs.namereplace_errors(
+                UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
+            ("\\uffff", 1)
+        )
+        if SIZEOF_WCHAR_T > 0:
+            self.assertEqual(
+                codecs.namereplace_errors(
+                    UnicodeEncodeError("ascii", "\U00010000",
+                                       0, 1, "ouch")),
+                ("\\N{LINEAR B SYLLABLE B008 A}", 1)
+            )
+            self.assertEqual(
+                codecs.namereplace_errors(
+                    UnicodeEncodeError("ascii", "\U0010ffff",
+                                       0, 1, "ouch")),
+                ("\\U0010ffff", 1)
+            )
+            # Lone surrogates (regardless of unicode width)
+            self.assertEqual(
+                codecs.namereplace_errors(
+                    UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
+                ("\\ud800", 1)
+            )
+            self.assertEqual(
+                codecs.namereplace_errors(
+                    UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
+                ("\\udfff", 1)
+            )
+
     def test_badhandlerresults(self):
         results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
         encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
@@ -651,6 +742,10 @@
             codecs.backslashreplace_errors,
             codecs.lookup_error("backslashreplace")
         )
+        self.assertEqual(
+            codecs.namereplace_errors,
+            codecs.lookup_error("namereplace")
+        )
 
     def test_unencodablereplacement(self):
         def unencrepl(exc):
@@ -804,7 +899,8 @@
         class D(dict):
             def __getitem__(self, key):
                 raise ValueError
-        for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
+        for err in ("strict", "replace", "xmlcharrefreplace",
+                    "backslashreplace", "namereplace", "test.posreturn"):
             self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
             self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
             self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -349,6 +349,8 @@
         self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
         self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                          "[\\udc80]".encode(self.encoding))
+        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
+                         "[\\udc80]".encode(self.encoding))
         self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                          "[�]".encode(self.encoding))
         self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
@@ -808,6 +810,7 @@
                 ('\udc80', 'ignore', b''),
                 ('\udc80', 'replace', b'?'),
                 ('\udc80', 'backslashreplace', b'\\udc80'),
+                ('\udc80', 'namereplace', b'\\udc80'),
                 ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
             ))
         else:
@@ -869,6 +872,8 @@
         self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
         self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                          b'[\\udc80]')
+        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
+                         b'[\\udc80]')
         self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                          b'[�]')
         self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
@@ -2824,6 +2829,8 @@
             ('[\xff]', 'replace', b'[y]'),
             ('[\u20ac]', 'replace', b'[?]'),
             ('[\xff]', 'backslashreplace', b'[\\xff]'),
+            ('[\xff]', 'namereplace',
+             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
             ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'),
             ('\udcff', 'strict', None),
             ('[\udcff]', 'surrogateescape', b'[\xff]'),
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -191,6 +191,8 @@
 Library
 -------
 
+- Issue #19676: Added the "namereplace" error handler.
+
 - Issue #22788: Add *context* parameter to logging.handlers.HTTPHandler.
 
 - Issue #22921: Allow SSLContext to take the *hostname* parameter even if
diff --git a/Python/codecs.c b/Python/codecs.c
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -9,6 +9,7 @@
    ------------------------------------------------------------------------ */
 
 #include "Python.h"
+#include "ucnhash.h"
 #include <ctype.h>
 
 const char *Py_hexdigits = "0123456789abcdef";
@@ -933,6 +934,97 @@
     }
 }
 
+static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
+static int ucnhash_initialized = 0;
+
+PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
+{
+    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+        PyObject *restuple;
+        PyObject *object;
+        Py_ssize_t i;
+        Py_ssize_t start;
+        Py_ssize_t end;
+        PyObject *res;
+        unsigned char *outp;
+        int ressize;
+        Py_UCS4 c;
+        char buffer[256]; /* NAME_MAXLEN */
+        if (PyUnicodeEncodeError_GetStart(exc, &start))
+            return NULL;
+        if (PyUnicodeEncodeError_GetEnd(exc, &end))
+            return NULL;
+        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+            return NULL;
+        if (!ucnhash_initialized) {
+            /* load the unicode data module */
+            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
+                                            PyUnicodeData_CAPSULE_NAME, 1);
+            ucnhash_initialized = 1;
+        }
+        for (i = start, ressize = 0; i < end; ++i) {
+            /* object is guaranteed to be "ready" */
+            c = PyUnicode_READ_CHAR(object, i);
+            if (ucnhash_CAPI &&
+                ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
+                ressize += 1+1+1+strlen(buffer)+1;
+            }
+            else if (c >= 0x10000) {
+                ressize += 1+1+8;
+            }
+            else if (c >= 0x100) {
+                ressize += 1+1+4;
+            }
+            else
+                ressize += 1+1+2;
+        }
+        res = PyUnicode_New(ressize, 127);
+        if (res==NULL)
+            return NULL;
+        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
+            i < end; ++i) {
+            c = PyUnicode_READ_CHAR(object, i);
+            *outp++ = '\\';
+            if (ucnhash_CAPI &&
+                ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
+                *outp++ = 'N';
+                *outp++ = '{';
+                strcpy((char *)outp, buffer);
+                outp += strlen(buffer);
+                *outp++ = '}';
+                continue;
+            }
+            if (c >= 0x00010000) {
+                *outp++ = 'U';
+                *outp++ = Py_hexdigits[(c>>28)&0xf];
+                *outp++ = Py_hexdigits[(c>>24)&0xf];
+                *outp++ = Py_hexdigits[(c>>20)&0xf];
+                *outp++ = Py_hexdigits[(c>>16)&0xf];
+                *outp++ = Py_hexdigits[(c>>12)&0xf];
+                *outp++ = Py_hexdigits[(c>>8)&0xf];
+            }
+            else if (c >= 0x100) {
+                *outp++ = 'u';
+                *outp++ = Py_hexdigits[(c>>12)&0xf];
+                *outp++ = Py_hexdigits[(c>>8)&0xf];
+            }
+            else
+                *outp++ = 'x';
+            *outp++ = Py_hexdigits[(c>>4)&0xf];
+            *outp++ = Py_hexdigits[c&0xf];
+        }
+
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        restuple = Py_BuildValue("(Nn)", res, end);
+        Py_DECREF(object);
+        return restuple;
+    }
+    else {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+}
+
 #define ENC_UNKNOWN     -1
 #define ENC_UTF8        0
 #define ENC_UTF16BE     1
@@ -1276,6 +1368,11 @@
     return PyCodec_BackslashReplaceErrors(exc);
 }
 
+static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
+{
+    return PyCodec_NameReplaceErrors(exc);
+}
+
 static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
 {
     return PyCodec_SurrogatePassErrors(exc);
@@ -1346,6 +1443,17 @@
             }
         },
         {
+            "namereplace",
+            {
+                "namereplace_errors",
+                namereplace_errors,
+                METH_O,
+                PyDoc_STR("Implements the 'namereplace' error handling, "
+                          "which replaces an unencodable character with a "
+                          "\\N{...} escape sequence.")
+            }
+        },
+        {
             "surrogatepass",
             {
                 "surrogatepass",

-- 
Repository URL: https://hg.python.org/cpython