[Python-checkins] bpo-46848: Use stringlib/fastsearch in mmap (GH-31625)

sweeneyde webhook-mailer at python.org
Tue Mar 1 23:46:34 EST 2022


https://github.com/python/cpython/commit/6ddb09f35b922a3bbb59e408a3ca7636a6938468
commit: 6ddb09f35b922a3bbb59e408a3ca7636a6938468
branch: main
author: Dennis Sweeney <36520290+sweeneyde at users.noreply.github.com>
committer: sweeneyde <36520290+sweeneyde at users.noreply.github.com>
date: 2022-03-01T23:46:30-05:00
summary:

bpo-46848: Use stringlib/fastsearch in mmap (GH-31625)

Speed up mmap.find(). Add _PyBytes_Find() and _PyBytes_ReverseFind().

files:
A Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst
M Include/cpython/bytesobject.h
M Modules/mmapmodule.c
M Objects/bytesobject.c

diff --git a/Include/cpython/bytesobject.h b/Include/cpython/bytesobject.h
index 6b3f55224fc55..38a0fe0af660f 100644
--- a/Include/cpython/bytesobject.h
+++ b/Include/cpython/bytesobject.h
@@ -116,3 +116,22 @@ PyAPI_FUNC(void*) _PyBytesWriter_WriteBytes(_PyBytesWriter *writer,
     void *str,
     const void *bytes,
     Py_ssize_t size);
+
+/* Substring Search.
+
+   Returns the index of the first occurrence of
+   a substring ("needle") in a larger text ("haystack").
+   If the needle is not found, return -1.
+   If the needle is found, add offset to the index.
+*/
+
+PyAPI_FUNC(Py_ssize_t)
+_PyBytes_Find(const char *haystack, Py_ssize_t len_haystack,
+              const char *needle, Py_ssize_t len_needle,
+              Py_ssize_t offset);
+
+/* Same as above, but search right-to-left */
+PyAPI_FUNC(Py_ssize_t)
+_PyBytes_ReverseFind(const char *haystack, Py_ssize_t len_haystack,
+                     const char *needle, Py_ssize_t len_needle,
+                     Py_ssize_t offset);
diff --git a/Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst b/Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst
new file mode 100644
index 0000000000000..bd20a843ab6ce
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst
@@ -0,0 +1,3 @@
+For performance, use the optimized string-searching implementations
+from :meth:`~bytes.find` and :meth:`~bytes.rfind`
+for :meth:`~mmap.mmap.find` and :meth:`~mmap.mmap.rfind`.
diff --git a/Modules/mmapmodule.c b/Modules/mmapmodule.c
index 26cedf1b9006d..6a038e72f93cf 100644
--- a/Modules/mmapmodule.c
+++ b/Modules/mmapmodule.c
@@ -315,12 +315,8 @@ mmap_gfind(mmap_object *self,
     if (!PyArg_ParseTuple(args, reverse ? "y*|nn:rfind" : "y*|nn:find",
                           &view, &start, &end)) {
         return NULL;
-    } else {
-        const char *p, *start_p, *end_p;
-        int sign = reverse ? -1 : 1;
-        const char *needle = view.buf;
-        Py_ssize_t len = view.len;
-
+    }
+    else {
         if (start < 0)
             start += self->size;
         if (start < 0)
@@ -335,21 +331,19 @@ mmap_gfind(mmap_object *self,
         else if (end > self->size)
             end = self->size;
 
-        start_p = self->data + start;
-        end_p = self->data + end;
-
-        for (p = (reverse ? end_p - len : start_p);
-             (p >= start_p) && (p + len <= end_p); p += sign) {
-            Py_ssize_t i;
-            for (i = 0; i < len && needle[i] == p[i]; ++i)
-                /* nothing */;
-            if (i == len) {
-                PyBuffer_Release(&view);
-                return PyLong_FromSsize_t(p - self->data);
-            }
+        Py_ssize_t res;
+        if (reverse) {
+            res = _PyBytes_ReverseFind(
+                self->data + start, end - start,
+                view.buf, view.len, start);
+        }
+        else {
+            res = _PyBytes_Find(
+                self->data + start, end - start,
+                view.buf, view.len, start);
         }
         PyBuffer_Release(&view);
-        return PyLong_FromLong(-1);
+        return PyLong_FromSsize_t(res);
     }
 }
 
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 3d8a21696d1c8..4c67b8f7af213 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -1247,6 +1247,24 @@ PyBytes_AsStringAndSize(PyObject *obj,
 
 #undef STRINGLIB_GET_EMPTY
 
+Py_ssize_t
+_PyBytes_Find(const char *haystack, Py_ssize_t len_haystack,
+              const char *needle, Py_ssize_t len_needle,
+              Py_ssize_t offset)
+{
+    return stringlib_find(haystack, len_haystack,
+                          needle, len_needle, offset);
+}
+
+Py_ssize_t
+_PyBytes_ReverseFind(const char *haystack, Py_ssize_t len_haystack,
+                     const char *needle, Py_ssize_t len_needle,
+                     Py_ssize_t offset)
+{
+    return stringlib_rfind(haystack, len_haystack,
+                           needle, len_needle, offset);
+}
+
 PyObject *
 PyBytes_Repr(PyObject *obj, int smartquotes)
 {



More information about the Python-checkins mailing list