[Python-checkins] gh-95781: More strict format string checking in PyUnicode_FromFormatV() (GH-95784)

serhiy-storchaka webhook-mailer at python.org
Mon Aug 8 12:21:37 EDT 2022


https://github.com/python/cpython/commit/62f06508e76e023a81861caee6a45e1d639bf530
commit: 62f06508e76e023a81861caee6a45e1d639bf530
branch: main
author: Serhiy Storchaka <storchaka at gmail.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-08-08T19:21:07+03:00
summary:

gh-95781: More strict format string checking in PyUnicode_FromFormatV() (GH-95784)

An unrecognized format character in PyUnicode_FromFormat() and
PyUnicode_FromFormatV() now sets a SystemError.
In previous versions, it caused the rest of the format string to be
copied as-is to the result string and any extra arguments to be discarded.

files:
A Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst
M Doc/c-api/unicode.rst
M Doc/whatsnew/3.12.rst
M Lib/test/test_unicode.py
M Objects/unicodeobject.c

diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 339ee35c7aa4..99afebd762a4 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -477,9 +477,6 @@ APIs:
    |                   |                     | :c:func:`PyObject_Repr`.         |
    +-------------------+---------------------+----------------------------------+
 
-   An unrecognized format character causes all the rest of the format string to be
-   copied as-is to the result string, and any extra arguments discarded.
-
    .. note::
       The width formatter unit is number of characters rather than bytes.
       The precision formatter unit is number of bytes for ``"%s"`` and
@@ -500,6 +497,11 @@ APIs:
       Support width and precision formatter for ``"%s"``, ``"%A"``, ``"%U"``,
       ``"%V"``, ``"%S"``, ``"%R"`` added.
 
+   .. versionchanged:: 3.12
+      An unrecognized format character now sets a :exc:`SystemError`.
+      In previous versions it caused all the rest of the format string to be
+      copied as-is to the result string, and any extra arguments discarded.
+
 
 .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs)
 
diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst
index f1696cc4584c..6df122acba71 100644
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@@ -469,6 +469,12 @@ Porting to Python 3.12
   :py:meth:`~class.__subclasses__` (using :c:func:`PyObject_CallMethod`,
   for example).
 
+* An unrecognized format character in :c:func:`PyUnicode_FromFormat` and
+  :c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`.
+  In previous versions it caused all the rest of the format string to be
+  copied as-is to the result string, and any extra arguments discarded.
+  (Contributed by Serhiy Storchaka in :gh:`95781`.)
+
 
 Deprecated
 ----------
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 9765ed97a60a..63bccb72e046 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -2641,8 +2641,6 @@ def check_format(expected, format, *args):
                      b'%c%c', c_int(0x10000), c_int(0x100000))
 
         # test "%"
-        check_format('%',
-                     b'%')
         check_format('%',
                      b'%%')
         check_format('%s',
@@ -2819,23 +2817,22 @@ def check_format(expected, format, *args):
         check_format('repr=abc\ufffd',
                      b'repr=%V', None, b'abc\xff')
 
-        # not supported: copy the raw format string. these tests are just here
-        # to check for crashes and should not be considered as specifications
-        check_format('%s',
-                     b'%1%s', b'abc')
-        check_format('%1abc',
-                     b'%1abc')
-        check_format('%+i',
-                     b'%+i', c_int(10))
-        check_format('%.%s',
-                     b'%.%s', b'abc')
-
         # Issue #33817: empty strings
         check_format('',
                      b'')
         check_format('',
                      b'%s', b'')
 
+        # check for crashes
+        for fmt in (b'%', b'%0', b'%01', b'%.', b'%.1',
+                    b'%0%s', b'%1%s', b'%.%s', b'%.1%s', b'%1abc',
+                    b'%l', b'%ll', b'%z', b'%ls', b'%lls', b'%zs'):
+            with self.subTest(fmt=fmt):
+                self.assertRaisesRegex(SystemError, 'invalid format string',
+                    PyUnicode_FromFormat, fmt, b'abc')
+        self.assertRaisesRegex(SystemError, 'invalid format string',
+            PyUnicode_FromFormat, b'%+i', c_int(10))
+
     # Test PyUnicode_AsWideChar()
     @support.cpython_only
     @unittest.skipIf(_testcapi is None, 'need _testcapi module')
diff --git a/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst b/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst
new file mode 100644
index 000000000000..eb2fd7e9da3d
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst	
@@ -0,0 +1,4 @@
+An unrecognized format character in :c:func:`PyUnicode_FromFormat` and
+:c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`.
+In previous versions it caused all the rest of the format string to be
+copied as-is to the result string, and any extra arguments discarded.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7ff79953257e..184a2bfd5dd8 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2355,6 +2355,13 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
 
     p = f;
     f++;
+    if (*f == '%') {
+        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
+            return NULL;
+        f++;
+        return f;
+    }
+
     zeropad = 0;
     if (*f == '0') {
         zeropad = 1;
@@ -2392,14 +2399,6 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
                 f++;
             }
         }
-        if (*f == '%') {
-            /* "%.3%s" => f points to "3" */
-            f--;
-        }
-    }
-    if (*f == '\0') {
-        /* bogus format "%.123" => go backward, f points to "3" */
-        f--;
     }
 
     /* Handle %ld, %lu, %lld and %llu. */
@@ -2423,7 +2422,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
         ++f;
     }
 
-    if (f[1] == '\0')
+    if (f[0] != '\0' && f[1] == '\0')
         writer->overallocate = 0;
 
     switch (*f) {
@@ -2616,21 +2615,9 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
         break;
     }
 
-    case '%':
-        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
-            return NULL;
-        break;
-
     default:
-        /* if we stumble upon an unknown formatting code, copy the rest
-           of the format string to the output string. (we cannot just
-           skip the code, since there's no way to know what's in the
-           argument list) */
-        len = strlen(p);
-        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
-            return NULL;
-        f = p+len;
-        return f;
+        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
+        return NULL;
     }
 
     f++;



More information about the Python-checkins mailing list