[Python-checkins] bpo-41930: Add support for SQLite serialise/deserialise API (GH-26728)

JelleZijlstra webhook-mailer at python.org
Tue Apr 5 10:15:35 EDT 2022


https://github.com/python/cpython/commit/a7551247e7cb7010fb4735281f1afa4abeb8a9cc
commit: a7551247e7cb7010fb4735281f1afa4abeb8a9cc
branch: main
author: Erlend Egeberg Aasland <erlend.aasland at innova.no>
committer: JelleZijlstra <jelle.zijlstra at gmail.com>
date: 2022-04-05T07:15:25-07:00
summary:

bpo-41930: Add support for SQLite serialise/deserialise API (GH-26728)

Co-authored-by: Jelle Zijlstra <jelle.zijlstra at gmail.com>
Co-authored-by: Kumar Aditya <59607654+kumaraditya303 at users.noreply.github.com>

files:
A Misc/NEWS.d/next/Library/2021-06-17-00-02-58.bpo-41930.JS6fsd.rst
M Doc/library/sqlite3.rst
M Doc/whatsnew/3.11.rst
M Lib/test/test_sqlite3/test_dbapi.py
M Modules/_sqlite/clinic/connection.c.h
M Modules/_sqlite/connection.c
M PCbuild/_sqlite3.vcxproj
M configure
M configure.ac
M pyconfig.h.in

diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst
index e70d038e61d82..852b68437a265 100644
--- a/Doc/library/sqlite3.rst
+++ b/Doc/library/sqlite3.rst
@@ -748,6 +748,44 @@ Connection Objects
       .. versionadded:: 3.11
 
 
+   .. method:: serialize(*, name="main")
+
+      This method serializes a database into a :class:`bytes` object.  For an
+      ordinary on-disk database file, the serialization is just a copy of the
+      disk file.  For an in-memory database or a "temp" database, the
+      serialization is the same sequence of bytes which would be written to
+      disk if that database were backed up to disk.
+
+      *name* is the database to be serialized, and defaults to the main
+      database.
+
+      .. note::
+
+         This method is only available if the underlying SQLite library has the
+         serialize API.
+
+      .. versionadded:: 3.11
+
+
+   .. method:: deserialize(data, /, *, name="main")
+
+      This method causes the database connection to disconnect from database
+      *name*, and reopen *name* as an in-memory database based on the
+      serialization contained in *data*.  Deserialization will raise
+      :exc:`OperationalError` if the database connection is currently involved
+      in a read transaction or a backup operation.  :exc:`OverflowError` will
+      be raised if ``len(data)`` is larger than ``2**63 - 1``, and
+      :exc:`DatabaseError` will be raised if *data* does not contain a valid
+      SQLite database.
+
+      .. note::
+
+         This method is only available if the underlying SQLite library has the
+         deserialize API.
+
+      .. versionadded:: 3.11
+
+
 .. _sqlite3-cursor-objects:
 
 Cursor Objects
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index d0c10a9100997..c312645c31cd7 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -366,6 +366,11 @@ sqlite3
   Instead we leave it to the SQLite library to handle these cases.
   (Contributed by Erlend E. Aasland in :issue:`44092`.)
 
+* Add :meth:`~sqlite3.Connection.serialize` and
+  :meth:`~sqlite3.Connection.deserialize` to :class:`sqlite3.Connection` for
+  serializing and deserializing databases.
+  (Contributed by Erlend E. Aasland in :issue:`41930`.)
+
 
 sys
 ---
diff --git a/Lib/test/test_sqlite3/test_dbapi.py b/Lib/test/test_sqlite3/test_dbapi.py
index 177c2cd327ff3..02482816cb933 100644
--- a/Lib/test/test_sqlite3/test_dbapi.py
+++ b/Lib/test/test_sqlite3/test_dbapi.py
@@ -29,6 +29,7 @@
 
 from test.support import (
     SHORT_TIMEOUT,
+    bigmemtest,
     check_disallow_instantiation,
     threading_helper,
 )
@@ -603,6 +604,56 @@ def test_uninit_operations(self):
                                        func)
 
 
+ at unittest.skipUnless(hasattr(sqlite.Connection, "serialize"),
+                     "Needs SQLite serialize API")
+class SerializeTests(unittest.TestCase):
+    def test_serialize_deserialize(self):
+        with memory_database() as cx:
+            with cx:
+                cx.execute("create table t(t)")
+            data = cx.serialize()
+            self.assertEqual(len(data), 8192)
+
+            # Remove test table, verify that it was removed.
+            with cx:
+                cx.execute("drop table t")
+            regex = "no such table"
+            with self.assertRaisesRegex(sqlite.OperationalError, regex):
+                cx.execute("select t from t")
+
+            # Deserialize and verify that test table is restored.
+            cx.deserialize(data)
+            cx.execute("select t from t")
+
+    def test_deserialize_wrong_args(self):
+        dataset = (
+            (BufferError, memoryview(b"blob")[::2]),
+            (TypeError, []),
+            (TypeError, 1),
+            (TypeError, None),
+        )
+        for exc, arg in dataset:
+            with self.subTest(exc=exc, arg=arg):
+                with memory_database() as cx:
+                    self.assertRaises(exc, cx.deserialize, arg)
+
+    def test_deserialize_corrupt_database(self):
+        with memory_database() as cx:
+            regex = "file is not a database"
+            with self.assertRaisesRegex(sqlite.DatabaseError, regex):
+                cx.deserialize(b"\0\1\3")
+                # SQLite does not generate an error until you try to query the
+                # deserialized database.
+                cx.execute("create table fail(f)")
+
+    @unittest.skipUnless(sys.maxsize > 2**32, 'requires 64bit platform')
+    @bigmemtest(size=2**63, memuse=3, dry_run=False)
+    def test_deserialize_too_much_data_64bit(self):
+        with memory_database() as cx:
+            with self.assertRaisesRegex(OverflowError, "'data' is too large"):
+                cx.deserialize(b"b" * size)
+
+
 class OpenTests(unittest.TestCase):
     _sql = "create table test(id integer)"
 
@@ -1030,6 +1081,10 @@ def test_check_connection_thread(self):
             lambda: self.con.setlimit(sqlite.SQLITE_LIMIT_LENGTH, -1),
             lambda: self.con.getlimit(sqlite.SQLITE_LIMIT_LENGTH),
         ]
+        if hasattr(sqlite.Connection, "serialize"):
+            fns.append(lambda: self.con.serialize())
+            fns.append(lambda: self.con.deserialize(b""))
+
         for fn in fns:
             with self.subTest(fn=fn):
                 self._run_test(fn)
diff --git a/Misc/NEWS.d/next/Library/2021-06-17-00-02-58.bpo-41930.JS6fsd.rst b/Misc/NEWS.d/next/Library/2021-06-17-00-02-58.bpo-41930.JS6fsd.rst
new file mode 100644
index 0000000000000..ce494e7225e22
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-06-17-00-02-58.bpo-41930.JS6fsd.rst
@@ -0,0 +1,3 @@
+Add :meth:`~sqlite3.Connection.serialize` and
+:meth:`~sqlite3.Connection.deserialize` support to :mod:`sqlite3`. Patch by
+Erlend E. Aasland.
diff --git a/Modules/_sqlite/clinic/connection.c.h b/Modules/_sqlite/clinic/connection.c.h
index 111e344fd2ae1..99ef94ecd71ec 100644
--- a/Modules/_sqlite/clinic/connection.c.h
+++ b/Modules/_sqlite/clinic/connection.c.h
@@ -693,6 +693,156 @@ pysqlite_connection_create_collation(pysqlite_Connection *self, PyTypeObject *cl
     return return_value;
 }
 
+#if defined(PY_SQLITE_HAVE_SERIALIZE)
+
+PyDoc_STRVAR(serialize__doc__,
+"serialize($self, /, *, name=\'main\')\n"
+"--\n"
+"\n"
+"Serialize a database into a byte string.\n"
+"\n"
+"  name\n"
+"    Which database to serialize.\n"
+"\n"
+"For an ordinary on-disk database file, the serialization is just a copy of the\n"
+"disk file. For an in-memory database or a \"temp\" database, the serialization is\n"
+"the same sequence of bytes which would be written to disk if that database\n"
+"were backed up to disk.");
+
+#define SERIALIZE_METHODDEF    \
+    {"serialize", (PyCFunction)(void(*)(void))serialize, METH_FASTCALL|METH_KEYWORDS, serialize__doc__},
+
+static PyObject *
+serialize_impl(pysqlite_Connection *self, const char *name);
+
+static PyObject *
+serialize(pysqlite_Connection *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"name", NULL};
+    static _PyArg_Parser _parser = {NULL, _keywords, "serialize", 0};
+    PyObject *argsbuf[1];
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0;
+    const char *name = "main";
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 0, 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!noptargs) {
+        goto skip_optional_kwonly;
+    }
+    if (!PyUnicode_Check(args[0])) {
+        _PyArg_BadArgument("serialize", "argument 'name'", "str", args[0]);
+        goto exit;
+    }
+    Py_ssize_t name_length;
+    name = PyUnicode_AsUTF8AndSize(args[0], &name_length);
+    if (name == NULL) {
+        goto exit;
+    }
+    if (strlen(name) != (size_t)name_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+skip_optional_kwonly:
+    return_value = serialize_impl(self, name);
+
+exit:
+    return return_value;
+}
+
+#endif /* defined(PY_SQLITE_HAVE_SERIALIZE) */
+
+#if defined(PY_SQLITE_HAVE_SERIALIZE)
+
+PyDoc_STRVAR(deserialize__doc__,
+"deserialize($self, data, /, *, name=\'main\')\n"
+"--\n"
+"\n"
+"Load a serialized database.\n"
+"\n"
+"  data\n"
+"    The serialized database content.\n"
+"  name\n"
+"    Which database to reopen with the deserialization.\n"
+"\n"
+"The deserialize interface causes the database connection to disconnect from the\n"
+"target database, and then reopen it as an in-memory database based on the given\n"
+"serialized data.\n"
+"\n"
+"The deserialize interface will fail with SQLITE_BUSY if the database is\n"
+"currently in a read transaction or is involved in a backup operation.");
+
+#define DESERIALIZE_METHODDEF    \
+    {"deserialize", (PyCFunction)(void(*)(void))deserialize, METH_FASTCALL|METH_KEYWORDS, deserialize__doc__},
+
+static PyObject *
+deserialize_impl(pysqlite_Connection *self, Py_buffer *data,
+                 const char *name);
+
+static PyObject *
+deserialize(pysqlite_Connection *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    static const char * const _keywords[] = {"", "name", NULL};
+    static _PyArg_Parser _parser = {NULL, _keywords, "deserialize", 0};
+    PyObject *argsbuf[2];
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
+    Py_buffer data = {NULL, NULL};
+    const char *name = "main";
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (PyUnicode_Check(args[0])) {
+        Py_ssize_t len;
+        const char *ptr = PyUnicode_AsUTF8AndSize(args[0], &len);
+        if (ptr == NULL) {
+            goto exit;
+        }
+        PyBuffer_FillInfo(&data, args[0], (void *)ptr, len, 1, 0);
+    }
+    else { /* any bytes-like object */
+        if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) {
+            goto exit;
+        }
+        if (!PyBuffer_IsContiguous(&data, 'C')) {
+            _PyArg_BadArgument("deserialize", "argument 1", "contiguous buffer", args[0]);
+            goto exit;
+        }
+    }
+    if (!noptargs) {
+        goto skip_optional_kwonly;
+    }
+    if (!PyUnicode_Check(args[1])) {
+        _PyArg_BadArgument("deserialize", "argument 'name'", "str", args[1]);
+        goto exit;
+    }
+    Py_ssize_t name_length;
+    name = PyUnicode_AsUTF8AndSize(args[1], &name_length);
+    if (name == NULL) {
+        goto exit;
+    }
+    if (strlen(name) != (size_t)name_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+skip_optional_kwonly:
+    return_value = deserialize_impl(self, &data, name);
+
+exit:
+    /* Cleanup for data */
+    if (data.obj) {
+       PyBuffer_Release(&data);
+    }
+
+    return return_value;
+}
+
+#endif /* defined(PY_SQLITE_HAVE_SERIALIZE) */
+
 PyDoc_STRVAR(pysqlite_connection_enter__doc__,
 "__enter__($self, /)\n"
 "--\n"
@@ -832,4 +982,12 @@ getlimit(pysqlite_Connection *self, PyObject *arg)
 #ifndef PYSQLITE_CONNECTION_LOAD_EXTENSION_METHODDEF
     #define PYSQLITE_CONNECTION_LOAD_EXTENSION_METHODDEF
 #endif /* !defined(PYSQLITE_CONNECTION_LOAD_EXTENSION_METHODDEF) */
-/*[clinic end generated code: output=176c9095219b17c4 input=a9049054013a1b77]*/
+
+#ifndef SERIALIZE_METHODDEF
+    #define SERIALIZE_METHODDEF
+#endif /* !defined(SERIALIZE_METHODDEF) */
+
+#ifndef DESERIALIZE_METHODDEF
+    #define DESERIALIZE_METHODDEF
+#endif /* !defined(DESERIALIZE_METHODDEF) */
+/*[clinic end generated code: output=d965a68f9229a56c input=a9049054013a1b77]*/
diff --git a/Modules/_sqlite/connection.c b/Modules/_sqlite/connection.c
index 37f6d0fa5a502..9d187cfa99d23 100644
--- a/Modules/_sqlite/connection.c
+++ b/Modules/_sqlite/connection.c
@@ -1818,6 +1818,125 @@ pysqlite_connection_create_collation_impl(pysqlite_Connection *self,
     Py_RETURN_NONE;
 }
 
+#ifdef PY_SQLITE_HAVE_SERIALIZE
+/*[clinic input]
+_sqlite3.Connection.serialize as serialize
+
+    *
+    name: str = "main"
+        Which database to serialize.
+
+Serialize a database into a byte string.
+
+For an ordinary on-disk database file, the serialization is just a copy of the
+disk file. For an in-memory database or a "temp" database, the serialization is
+the same sequence of bytes which would be written to disk if that database
+were backed up to disk.
+[clinic start generated code]*/
+
+static PyObject *
+serialize_impl(pysqlite_Connection *self, const char *name)
+/*[clinic end generated code: output=97342b0e55239dd3 input=d2eb5194a65abe2b]*/
+{
+    if (!pysqlite_check_thread(self) || !pysqlite_check_connection(self)) {
+        return NULL;
+    }
+
+    /* Try the no-copy flag first: if SQLite has a contiguous in-memory
+     * representation of the database, no allocation is needed; else retry.
+     */
+    sqlite3_int64 size;
+    unsigned int flags = SQLITE_SERIALIZE_NOCOPY;
+    const char *data;
+
+    Py_BEGIN_ALLOW_THREADS
+    data = (const char *)sqlite3_serialize(self->db, name, &size, flags);
+    if (data == NULL) {
+        flags &= ~SQLITE_SERIALIZE_NOCOPY;
+        data = (const char *)sqlite3_serialize(self->db, name, &size, flags);
+    }
+    Py_END_ALLOW_THREADS
+
+    if (data == NULL) {
+        PyErr_Format(self->OperationalError, "unable to serialize '%s'",
+                     name);
+        return NULL;
+    }
+    PyObject *res = PyBytes_FromStringAndSize(data, size);
+    if (!(flags & SQLITE_SERIALIZE_NOCOPY)) {  // retry path: free the copy
+        sqlite3_free((void *)data);
+    }
+    return res;
+}
+
+/*[clinic input]
+_sqlite3.Connection.deserialize as deserialize
+
+    data: Py_buffer(accept={buffer, str})
+        The serialized database content.
+    /
+    *
+    name: str = "main"
+        Which database to reopen with the deserialization.
+
+Load a serialized database.
+
+The deserialize interface causes the database connection to disconnect from the
+target database, and then reopen it as an in-memory database based on the given
+serialized data.
+
+The deserialize interface will fail with SQLITE_BUSY if the database is
+currently in a read transaction or is involved in a backup operation.
+[clinic start generated code]*/
+
+static PyObject *
+deserialize_impl(pysqlite_Connection *self, Py_buffer *data,
+                 const char *name)
+/*[clinic end generated code: output=e394c798b98bad89 input=1be4ca1faacf28f2]*/
+{
+    if (!pysqlite_check_thread(self) || !pysqlite_check_connection(self)) {
+        return NULL;
+    }
+
+    /* Hand a private copy of the buffer over to SQLite:
+     * - Copy the Python buffer into memory obtained from sqlite3_malloc64()
+     * - Tell SQLite to free that memory (SQLITE_DESERIALIZE_FREEONCLOSE)
+     * - Tell SQLite it may grow the database (SQLITE_DESERIALIZE_RESIZEABLE)
+     *
+     * Make sure we don't overflow sqlite3_deserialize(); it accepts a signed
+     * 64-bit int as its data size argument.
+     *
+     * We can safely use sqlite3_malloc64 here, since it was introduced before
+     * the serialize APIs.
+     */
+    if (data->len > 9223372036854775807) {  // (1 << 63) - 1
+        PyErr_SetString(PyExc_OverflowError, "'data' is too large");
+        return NULL;
+    }
+
+    sqlite3_int64 size = (sqlite3_int64)data->len;
+    unsigned char *buf = sqlite3_malloc64(size);
+    if (buf == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    const unsigned int flags = SQLITE_DESERIALIZE_FREEONCLOSE |
+                               SQLITE_DESERIALIZE_RESIZEABLE;
+    int rc;
+    Py_BEGIN_ALLOW_THREADS
+    (void)memcpy(buf, data->buf, data->len);
+    rc = sqlite3_deserialize(self->db, name, buf, size, size, flags);
+    Py_END_ALLOW_THREADS
+
+    if (rc != SQLITE_OK) {
+        (void)_pysqlite_seterror(self->state, self->db);
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif  // PY_SQLITE_HAVE_SERIALIZE
+
+
 /*[clinic input]
 _sqlite3.Connection.__enter__ as pysqlite_connection_enter
 
@@ -1971,6 +2090,8 @@ static PyMethodDef connection_methods[] = {
     PYSQLITE_CONNECTION_SET_TRACE_CALLBACK_METHODDEF
     SETLIMIT_METHODDEF
     GETLIMIT_METHODDEF
+    SERIALIZE_METHODDEF
+    DESERIALIZE_METHODDEF
     {NULL, NULL}
 };
 
diff --git a/PCbuild/_sqlite3.vcxproj b/PCbuild/_sqlite3.vcxproj
index e268c473f4c98..9cff43f73e5be 100644
--- a/PCbuild/_sqlite3.vcxproj
+++ b/PCbuild/_sqlite3.vcxproj
@@ -94,6 +94,7 @@
   <ItemDefinitionGroup>
     <ClCompile>
       <AdditionalIncludeDirectories>$(sqlite3Dir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>PY_SQLITE_HAVE_SERIALIZE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
   </ItemDefinitionGroup>
   <ItemGroup>
diff --git a/configure b/configure
index 44912b9c34df8..69b12309de578 100755
--- a/configure
+++ b/configure
@@ -12902,6 +12902,50 @@ if test "x$ac_cv_lib_sqlite3_sqlite3_load_extension" = xyes; then :
 else
   have_sqlite3_load_extension=no
 
+fi
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sqlite3_serialize in -lsqlite3" >&5
+$as_echo_n "checking for sqlite3_serialize in -lsqlite3... " >&6; }
+if ${ac_cv_lib_sqlite3_sqlite3_serialize+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lsqlite3  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char sqlite3_serialize ();
+int
+main ()
+{
+return sqlite3_serialize ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_sqlite3_sqlite3_serialize=yes
+else
+  ac_cv_lib_sqlite3_sqlite3_serialize=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_sqlite3_sqlite3_serialize" >&5
+$as_echo "$ac_cv_lib_sqlite3_sqlite3_serialize" >&6; }
+if test "x$ac_cv_lib_sqlite3_sqlite3_serialize" = xyes; then :
+
+
+$as_echo "#define PY_SQLITE_HAVE_SERIALIZE 1" >>confdefs.h
+
+
 fi
 
 
diff --git a/configure.ac b/configure.ac
index c02adf7bf3f14..5860595b752c8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3605,6 +3605,12 @@ dnl hence CPPFLAGS instead of CFLAGS.
         [have_sqlite3_load_extension=yes],
         [have_sqlite3_load_extension=no]
       )
+      AC_CHECK_LIB([sqlite3], [sqlite3_serialize], [
+        AC_DEFINE(
+          [PY_SQLITE_HAVE_SERIALIZE], [1],
+          [Define if SQLite was compiled with the serialize API]
+        )
+      ])
     ], [
       have_supported_sqlite3=no
     ])
diff --git a/pyconfig.h.in b/pyconfig.h.in
index be776f734163f..4ac054a28d8ef 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -1506,6 +1506,9 @@
 /* Define to 1 to build the sqlite module with loadable extensions support. */
 #undef PY_SQLITE_ENABLE_LOAD_EXTENSION
 
+/* Define if SQLite was compiled with the serialize API */
+#undef PY_SQLITE_HAVE_SERIALIZE
+
 /* Default cipher suites list for ssl module. 1: Python's preferred selection,
    2: leave OpenSSL defaults untouched, 0: custom string */
 #undef PY_SSL_DEFAULT_CIPHERS



More information about the Python-checkins mailing list