[Python-checkins] bpo-38256: Fix binascii.crc32() when inputs are 4+GiB (GH-32000)

gpshead webhook-mailer at python.org
Sun Mar 20 15:28:40 EDT 2022


https://github.com/python/cpython/commit/9d1c4d69dbc800ac344565119337fcf490cdc800
commit: 9d1c4d69dbc800ac344565119337fcf490cdc800
branch: main
author: Gregory P. Smith <greg at krypto.org>
committer: gpshead <greg at krypto.org>
date: 2022-03-20T12:28:15-07:00
summary:

bpo-38256: Fix binascii.crc32() when inputs are 4+GiB (GH-32000)

When compiled with `USE_ZLIB_CRC32` defined (`configure` sets this on POSIX systems), `binascii.crc32(...)` failed to compute the correct value when the input data was >= 4GiB. Because the zlib crc32 API is limited to a 32-bit length.

This lines it up with the `zlib.crc32(...)` implementation that doesn't have that flaw.

**Performance:** This also adopts the same GIL releasing for larger inputs logic that `zlib.crc32` has, and causes the Windows build to always use zlib's crc32 instead of our slow C code as zlib is a required build dependency on Windows.

files:
A Misc/NEWS.d/next/Library/2022-03-19-15-54-41.bpo-38256.FoMbjE.rst
M Lib/test/test_binascii.py
M Modules/binascii.c
M Modules/clinic/zlibmodule.c.h
M Modules/zlibmodule.c
M PCbuild/pythoncore.vcxproj

diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py
index b5aa847b943e6..7087d7a471d3f 100644
--- a/Lib/test/test_binascii.py
+++ b/Lib/test/test_binascii.py
@@ -4,7 +4,7 @@
 import binascii
 import array
 import re
-from test.support import warnings_helper
+from test.support import bigmemtest, _1G, _4G, warnings_helper
 
 
 # Note: "*_hex" functions are aliases for "(un)hexlify"
@@ -441,6 +441,14 @@ class BytearrayBinASCIITest(BinASCIITest):
 class MemoryviewBinASCIITest(BinASCIITest):
     type2test = memoryview
 
+class ChecksumBigBufferTestCase(unittest.TestCase):
+    """bpo-38256 - check that inputs >=4 GiB are handled correctly."""
+
+    @bigmemtest(size=_4G + 4, memuse=1, dry_run=False)
+    def test_big_buffer(self, size):
+        data = b"nyan" * (_1G + 1)
+        self.assertEqual(binascii.crc32(data), 1044521549)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2022-03-19-15-54-41.bpo-38256.FoMbjE.rst b/Misc/NEWS.d/next/Library/2022-03-19-15-54-41.bpo-38256.FoMbjE.rst
new file mode 100644
index 0000000000000..ea1763fca4138
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-19-15-54-41.bpo-38256.FoMbjE.rst
@@ -0,0 +1,14 @@
+Fix :func:`binascii.crc32` when it is compiled to use zlib'c crc32 to
+work properly on inputs 4+GiB in length instead of returning the wrong
+result. The workaround prior to this was to always feed the function
+data in increments smaller than 4GiB or to just call the zlib module
+function.
+
+We also have :func:`binascii.crc32` release the GIL when computing
+on larger inputs as :func:`zlib.crc32` and :mod:`hashlib` do.
+
+This also boosts performance on Windows as it now uses the zlib crc32
+implementation for :func:`binascii.crc32` for a 2-3x speedup.
+
+That the stdlib has a crc32 API in two modules is a known historical
+oddity. This moves us closer to a single implementation behind them.
diff --git a/Modules/binascii.c b/Modules/binascii.c
index fec0d82a39cdd..afe4988549171 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -737,6 +737,21 @@ static const unsigned int crc_32_tab[256] = {
 0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
 0x2d02ef8dU
 };
+
+static unsigned int
+internal_crc32(const unsigned char *bin_data, Py_ssize_t len, unsigned int crc)
+{ /* By Jim Ahlstrom; All rights transferred to CNRI */
+    unsigned int result;
+
+    crc = ~ crc;
+    while (len-- > 0) {
+        crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8);
+        /* Note:  (crc >> 8) MUST zero fill on left */
+    }
+
+    result = (crc ^ 0xFFFFFFFF);
+    return result & 0xffffffff;
+}
 #endif  /* USE_ZLIB_CRC32 */
 
 /*[clinic input]
@@ -754,34 +769,46 @@ binascii_crc32_impl(PyObject *module, Py_buffer *data, unsigned int crc)
 /*[clinic end generated code: output=52cf59056a78593b input=bbe340bc99d25aa8]*/
 
 #ifdef USE_ZLIB_CRC32
-/* This was taken from zlibmodule.c PyZlib_crc32 (but is PY_SSIZE_T_CLEAN) */
+/* This is the same as zlibmodule.c zlib_crc32_impl. It exists in two
+ * modules for historical reasons. */
 {
-    const Byte *buf;
-    Py_ssize_t len;
-    int signed_val;
-
-    buf = (Byte*)data->buf;
-    len = data->len;
-    signed_val = crc32(crc, buf, len);
-    return (unsigned int)signed_val & 0xffffffffU;
+    /* Releasing the GIL for very small buffers is inefficient
+       and may lower performance */
+    if (data->len > 1024*5) {
+        unsigned char *buf = data->buf;
+        Py_ssize_t len = data->len;
+
+        Py_BEGIN_ALLOW_THREADS
+        /* Avoid truncation of length for very large buffers. crc32() takes
+           length as an unsigned int, which may be narrower than Py_ssize_t. */
+        while ((size_t)len > UINT_MAX) {
+            crc = crc32(crc, buf, UINT_MAX);
+            buf += (size_t) UINT_MAX;
+            len -= (size_t) UINT_MAX;
+        }
+        crc = crc32(crc, buf, (unsigned int)len);
+        Py_END_ALLOW_THREADS
+    } else {
+        crc = crc32(crc, data->buf, (unsigned int)data->len);
+    }
+    return crc & 0xffffffff;
 }
 #else  /* USE_ZLIB_CRC32 */
-{ /* By Jim Ahlstrom; All rights transferred to CNRI */
-    const unsigned char *bin_data;
-    Py_ssize_t len;
-    unsigned int result;
-
-    bin_data = data->buf;
-    len = data->len;
-
-    crc = ~ crc;
-    while (len-- > 0) {
-        crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8);
-        /* Note:  (crc >> 8) MUST zero fill on left */
+{
+    const unsigned char *bin_data = data->buf;
+    Py_ssize_t len = data->len;
+
+    /* Releasing the GIL for very small buffers is inefficient
+       and may lower performance */
+    if (len > 1024*5) {
+        unsigned int result;
+        Py_BEGIN_ALLOW_THREADS
+        result = internal_crc32(bin_data, len, crc);
+        Py_END_ALLOW_THREADS
+        return result;
+    } else {
+        return internal_crc32(bin_data, len, crc);
     }
-
-    result = (crc ^ 0xFFFFFFFF);
-    return result & 0xffffffff;
 }
 #endif  /* USE_ZLIB_CRC32 */
 
diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h
index e2a5fccd36c54..71136d31db0e0 100644
--- a/Modules/clinic/zlibmodule.c.h
+++ b/Modules/clinic/zlibmodule.c.h
@@ -753,7 +753,7 @@ PyDoc_STRVAR(zlib_crc32__doc__,
 #define ZLIB_CRC32_METHODDEF    \
     {"crc32", (PyCFunction)(void(*)(void))zlib_crc32, METH_FASTCALL, zlib_crc32__doc__},
 
-static PyObject *
+static unsigned int
 zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value);
 
 static PyObject *
@@ -762,6 +762,7 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
     PyObject *return_value = NULL;
     Py_buffer data = {NULL, NULL};
     unsigned int value = 0;
+    unsigned int _return_value;
 
     if (!_PyArg_CheckPositional("crc32", nargs, 1, 2)) {
         goto exit;
@@ -781,7 +782,11 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
         goto exit;
     }
 skip_optional:
-    return_value = zlib_crc32_impl(module, &data, value);
+    _return_value = zlib_crc32_impl(module, &data, value);
+    if ((_return_value == (unsigned int)-1) && PyErr_Occurred()) {
+        goto exit;
+    }
+    return_value = PyLong_FromUnsignedLong((unsigned long)_return_value);
 
 exit:
     /* Cleanup for data */
@@ -815,4 +820,4 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
 #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
     #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
 #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
-/*[clinic end generated code: output=e3e8a6142ea045a7 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=f009d194565416d1 input=a9049054013a1b77]*/
diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c
index 4cf1b6eeba2f7..2fc39a3bd5620 100644
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -1420,7 +1420,7 @@ zlib_adler32_impl(PyObject *module, Py_buffer *data, unsigned int value)
 }
 
 /*[clinic input]
-zlib.crc32
+zlib.crc32 -> unsigned_int
 
     data: Py_buffer
     value: unsigned_int(bitwise=True) = 0
@@ -1432,9 +1432,9 @@ Compute a CRC-32 checksum of data.
 The returned checksum is an integer.
 [clinic start generated code]*/
 
-static PyObject *
+static unsigned int
 zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
-/*[clinic end generated code: output=63499fa20af7ea25 input=26c3ed430fa00b4c]*/
+/*[clinic end generated code: output=b217562e4fe6d6a6 input=1229cb2fb5ea948a]*/
 {
     /* Releasing the GIL for very small buffers is inefficient
        and may lower performance */
@@ -1455,7 +1455,7 @@ zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
     } else {
         value = crc32(value, data->buf, (unsigned int)data->len);
     }
-    return PyLong_FromUnsignedLong(value & 0xffffffffU);
+    return value;
 }
 
 
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 3686233a1ad50..5e6e703df9123 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -366,7 +366,9 @@
     <ClCompile Include="..\Modules\arraymodule.c" />
     <ClCompile Include="..\Modules\atexitmodule.c" />
     <ClCompile Include="..\Modules\audioop.c" />
-    <ClCompile Include="..\Modules\binascii.c" />
+    <ClCompile Include="..\Modules\binascii.c">
+      <PreprocessorDefinitions>USE_ZLIB_CRC32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
     <ClCompile Include="..\Modules\cmathmodule.c" />
     <ClCompile Include="..\Modules\_datetimemodule.c" />
     <ClCompile Include="..\Modules\errnomodule.c" />



More information about the Python-checkins mailing list