[Python-checkins] bpo-43086: Add handling for out-of-spec data in a2b_base64 (GH-24402)

gpshead webhook-mailer at python.org
Sun Jul 18 20:45:34 EDT 2021


https://github.com/python/cpython/commit/35b98e38b6edd63153fc8e092f94cb20725dacc1
commit: 35b98e38b6edd63153fc8e092f94cb20725dacc1
branch: main
author: Idan Moral <idan22moral at gmail.com>
committer: gpshead <greg at krypto.org>
date: 2021-07-18T17:45:19-07:00
summary:

bpo-43086: Add handling for out-of-spec data in a2b_base64 (GH-24402)

binascii.a2b_base64 gains a strict_mode= parameter. When enabled it will raise an
error on input that deviates from the base64 spec in any way.  The default remains
False for backward compatibility.

Code reviews and minor tweaks by: Gregory P. Smith <greg at krypto.org> [Google]

files:
A Misc/NEWS.d/next/Library/2021-01-31-18-24-54.bpo-43086.2_P-SH.rst
M Doc/library/binascii.rst
M Lib/test/test_binascii.py
M Modules/binascii.c
M Modules/clinic/binascii.c.h

diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst
index 2c0c1bce5d7f8f..fd5df69e852dc1 100644
--- a/Doc/library/binascii.rst
+++ b/Doc/library/binascii.rst
@@ -50,11 +50,23 @@ The :mod:`binascii` module defines the following functions:
       Added the *backtick* parameter.
 
 
-.. function:: a2b_base64(string)
+.. function:: a2b_base64(string, strict_mode=False)
 
    Convert a block of base64 data back to binary and return the binary data. More
    than one line may be passed at a time.
 
+   If *strict_mode* is true, only valid base64 data will be converted. Invalid base64
+   data will raise :exc:`binascii.Error`.
+
+   Valid base64:
+      * Conforms to :rfc:`3548`.
+      * Contains only characters from the base64 alphabet.
+      * Contains no excess data after padding (including excess padding, newlines, etc.).
+      * Does not start with a padding.
+
+   .. versionchanged:: 3.11
+      Added the *strict_mode* parameter.
+
 
 .. function:: b2a_base64(data, *, newline=True)
 
diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py
index 4d1bf2cce1f1e3..74438d837852c8 100644
--- a/Lib/test/test_binascii.py
+++ b/Lib/test/test_binascii.py
@@ -114,6 +114,47 @@ def addnoise(line):
         # empty strings. TBD: shouldn't it raise an exception instead ?
         self.assertEqual(binascii.a2b_base64(self.type2test(fillers)), b'')
 
+    def test_base64_strict_mode(self):
+        # Test base64 with strict mode on
+        def _assertRegexTemplate(assert_regex: str, data: bytes, non_strict_mode_expected_result: bytes):
+            with self.assertRaisesRegex(binascii.Error, assert_regex):
+                binascii.a2b_base64(self.type2test(data), strict_mode=True)
+            self.assertEqual(binascii.a2b_base64(self.type2test(data), strict_mode=False),
+                             non_strict_mode_expected_result)
+            self.assertEqual(binascii.a2b_base64(self.type2test(data)),
+                             non_strict_mode_expected_result)
+
+        def assertExcessData(data, non_strict_mode_expected_result: bytes):
+            _assertRegexTemplate(r'(?i)Excess data', data, non_strict_mode_expected_result)
+
+        def assertNonBase64Data(data, non_strict_mode_expected_result: bytes):
+            _assertRegexTemplate(r'(?i)Only base64 data', data, non_strict_mode_expected_result)
+
+        def assertMalformedPadding(data, non_strict_mode_expected_result: bytes):
+            _assertRegexTemplate(r'(?i)Leading padding', data, non_strict_mode_expected_result)
+
+        # Test excess data exceptions
+        assertExcessData(b'ab==a', b'i')
+        assertExcessData(b'ab===', b'i')
+        assertExcessData(b'ab==:', b'i')
+        assertExcessData(b'abc=a', b'i\xb7')
+        assertExcessData(b'abc=:', b'i\xb7')
+        assertExcessData(b'ab==\n', b'i')
+
+        # Test non-base64 data exceptions
+        assertNonBase64Data(b'\nab==', b'i')
+        assertNonBase64Data(b'ab:(){:|:&};:==', b'i')
+        assertNonBase64Data(b'a\nb==', b'i')
+        assertNonBase64Data(b'a\x00b==', b'i')
+
+        # Test malformed padding
+        assertMalformedPadding(b'=', b'')
+        assertMalformedPadding(b'==', b'')
+        assertMalformedPadding(b'===', b'')
+        assertMalformedPadding(b'ab=c=', b'i\xb7')
+        assertMalformedPadding(b'ab=ab==', b'i\xb6\x9b')
+
+
     def test_base64errors(self):
         # Test base64 with invalid padding
         def assertIncorrectPadding(data):
diff --git a/Misc/NEWS.d/next/Library/2021-01-31-18-24-54.bpo-43086.2_P-SH.rst b/Misc/NEWS.d/next/Library/2021-01-31-18-24-54.bpo-43086.2_P-SH.rst
new file mode 100644
index 00000000000000..f49e7a84cc5375
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-01-31-18-24-54.bpo-43086.2_P-SH.rst
@@ -0,0 +1,3 @@
+Added a new optional :code:`strict_mode` parameter to *binascii.a2b_base64*.
+When :code:`scrict_mode` is set to :code:`True`, the *a2b_base64* function will accept only valid base64 content.
+More details about what "valid base64 content" is, can be found in the function's documentation.
diff --git a/Modules/binascii.c b/Modules/binascii.c
index 59e4b0a4a784ee..50f25b406a924e 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -433,18 +433,26 @@ binascii.a2b_base64
 
     data: ascii_buffer
     /
+    *
+    strict_mode: bool(accept={int}) = False
 
 Decode a line of base64 data.
+
+  strict_mode
+    When set to True, bytes that are not part of the base64 standard are not allowed.
+    The same applies to excess data after padding (= / ==).
 [clinic start generated code]*/
 
 static PyObject *
-binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
-/*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/
+binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
+/*[clinic end generated code: output=5409557788d4f975 input=3a30c4e3528317c6]*/
 {
     assert(data->len >= 0);
 
     const unsigned char *ascii_data = data->buf;
     size_t ascii_len = data->len;
+    binascii_state *state = NULL;
+    char padding_started = 0;
 
     /* Allocate the buffer */
     Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
@@ -455,6 +463,15 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
         return NULL;
     unsigned char *bin_data_start = bin_data;
 
+    if (strict_mode && ascii_len > 0 && ascii_data[0] == '=') {
+        malformed_padding:
+        state = get_binascii_state(module);
+        if (state) {
+            PyErr_SetString(state->Error, "Leading padding not allowed");
+        }
+        goto error_end;
+    }
+
     int quad_pos = 0;
     unsigned char leftchar = 0;
     int pads = 0;
@@ -465,11 +482,21 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
         ** the invalid ones.
         */
         if (this_ch == BASE64_PAD) {
+            padding_started = 1;
+
             if (quad_pos >= 2 && quad_pos + ++pads >= 4) {
-                /* A pad sequence means no more input.
-                ** We've already interpreted the data
-                ** from the quad at this point.
+                /* A pad sequence means we should not parse more input.
+                ** We've already interpreted the data from the quad at this point.
+                ** in strict mode, an error should raise if there's excess data after the padding.
                 */
+                if (strict_mode && i + 1 < ascii_len) {
+                    state = get_binascii_state(module);
+                    if (state) {
+                        PyErr_SetString(state->Error, "Excess data after padding");
+                    }
+                    goto error_end;
+                }
+
                 goto done;
             }
             continue;
@@ -477,8 +504,20 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
 
         this_ch = table_a2b_base64[this_ch];
         if (this_ch >= 64) {
+            if (strict_mode) {
+                state = get_binascii_state(module);
+                if (state) {
+                    PyErr_SetString(state->Error, "Only base64 data is allowed");
+                }
+                goto error_end;
+            }
             continue;
         }
+
+        // Characters that are not '=', in the middle of the padding, are not allowed
+        if (strict_mode && padding_started) {
+            goto malformed_padding;
+        }
         pads = 0;
 
         switch (quad_pos) {
@@ -505,7 +544,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
     }
 
     if (quad_pos != 0) {
-        binascii_state *state = get_binascii_state(module);
+        state = get_binascii_state(module);
         if (state == NULL) {
             /* error already set, from get_binascii_state */
         } else if (quad_pos == 1) {
@@ -522,6 +561,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
         } else {
             PyErr_SetString(state->Error, "Incorrect padding");
         }
+        error_end:
         _PyBytesWriter_Dealloc(&writer);
         return NULL;
     }
diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h
index ae1c4574325c56..a9240046a59110 100644
--- a/Modules/clinic/binascii.c.h
+++ b/Modules/clinic/binascii.c.h
@@ -87,27 +87,48 @@ binascii_b2a_uu(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj
 }
 
 PyDoc_STRVAR(binascii_a2b_base64__doc__,
-"a2b_base64($module, data, /)\n"
+"a2b_base64($module, data, /, *, strict_mode=False)\n"
 "--\n"
 "\n"
-"Decode a line of base64 data.");
+"Decode a line of base64 data.\n"
+"\n"
+"  strict_mode\n"
+"    When set to True, bytes that are not part of the base64 standard are not allowed.\n"
+"    The same applies to excess data after padding (= / ==).");
 
 #define BINASCII_A2B_BASE64_METHODDEF    \
-    {"a2b_base64", (PyCFunction)binascii_a2b_base64, METH_O, binascii_a2b_base64__doc__},
+    {"a2b_base64", (PyCFunction)(void(*)(void))binascii_a2b_base64, METH_FASTCALL|METH_KEYWORDS, binascii_a2b_base64__doc__},
 
 static PyObject *
-binascii_a2b_base64_impl(PyObject *module, Py_buffer *data);
+binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode);
 
 static PyObject *
-binascii_a2b_base64(PyObject *module, PyObject *arg)
+binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
 {
     PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"", "strict_mode", NULL};
+    static _PyArg_Parser _parser = {NULL, _keywords, "a2b_base64", 0};
+    PyObject *argsbuf[2];
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
     Py_buffer data = {NULL, NULL};
+    int strict_mode = 0;
 
-    if (!ascii_buffer_converter(arg, &data)) {
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf);
+    if (!args) {
         goto exit;
     }
-    return_value = binascii_a2b_base64_impl(module, &data);
+    if (!ascii_buffer_converter(args[0], &data)) {
+        goto exit;
+    }
+    if (!noptargs) {
+        goto skip_optional_kwonly;
+    }
+    strict_mode = _PyLong_AsInt(args[1]);
+    if (strict_mode == -1 && PyErr_Occurred()) {
+        goto exit;
+    }
+skip_optional_kwonly:
+    return_value = binascii_a2b_base64_impl(module, &data, strict_mode);
 
 exit:
     /* Cleanup for data */
@@ -746,4 +767,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj
 
     return return_value;
 }
-/*[clinic end generated code: output=95a0178f30801b89 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=0f261ee49971f5ca input=a9049054013a1b77]*/



More information about the Python-checkins mailing list