[Python-checkins] gh-91146: Further reduce allocation size of list from str.split/rsplit (gh-95493)

corona10 webhook-mailer at python.org
Mon Aug 1 09:15:12 EDT 2022


https://github.com/python/cpython/commit/fb75d015f487e50079e8d2ea7859750684b124e4
commit: fb75d015f487e50079e8d2ea7859750684b124e4
branch: main
author: Dong-hee Na <donghee.na at python.org>
committer: corona10 <donghee.na92 at gmail.com>
date: 2022-08-01T22:15:07+09:00
summary:

gh-91146: Further reduce allocation size of list from str.split/rsplit (gh-95493)


Co-authored-by: Inada Naoki <songofacandy at gmail.com>

files:
M Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst
M Objects/unicodeobject.c

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst b/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst
index 52568dbedd130..9172ca298e809 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst	
@@ -1,2 +1,2 @@
 Reduce allocation size of :class:`list` from :meth:`str.split`
-and :meth:`str.rsplit`. Patch by Dong-hee Na.
+and :meth:`str.rsplit`. Patch by Dong-hee Na and Inada Naoki.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 355d74fe3bbda..7ff79953257ee 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -9698,11 +9698,11 @@ split(PyObject *self,
     PyObject* out;
     len1 = PyUnicode_GET_LENGTH(self);
     kind1 = PyUnicode_KIND(self);
-    if (maxcount < 0) {
-        maxcount = len1;
-    }
 
-    if (substring == NULL)
+    if (substring == NULL) {
+        if (maxcount < 0) {
+            maxcount = (len1 - 1) / 2 + 1;
+        }
         switch (kind1) {
         case PyUnicode_1BYTE_KIND:
             if (PyUnicode_IS_ASCII(self))
@@ -9728,9 +9728,16 @@ split(PyObject *self,
         default:
             Py_UNREACHABLE();
         }
+    }
 
     kind2 = PyUnicode_KIND(substring);
     len2 = PyUnicode_GET_LENGTH(substring);
+    if (maxcount < 0) {
+        // if len2 == 0, it will raise ValueError.
+        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
+        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
+        maxcount = maxcount < 0 ? len1 : maxcount;
+    }
     if (kind1 < kind2 || len1 < len2) {
         out = PyList_New(1);
         if (out == NULL)
@@ -9785,11 +9792,11 @@ rsplit(PyObject *self,
 
     len1 = PyUnicode_GET_LENGTH(self);
     kind1 = PyUnicode_KIND(self);
-    if (maxcount < 0) {
-        maxcount = len1;
-    }
 
-    if (substring == NULL)
+    if (substring == NULL) {
+        if (maxcount < 0) {
+            maxcount = (len1 - 1) / 2 + 1;
+        }
         switch (kind1) {
         case PyUnicode_1BYTE_KIND:
             if (PyUnicode_IS_ASCII(self))
@@ -9815,9 +9822,15 @@ rsplit(PyObject *self,
         default:
             Py_UNREACHABLE();
         }
-
+    }
     kind2 = PyUnicode_KIND(substring);
     len2 = PyUnicode_GET_LENGTH(substring);
+    if (maxcount < 0) {
+        // if len2 == 0, it will raise ValueError.
+        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
+        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
+        maxcount = maxcount < 0 ? len1 : maxcount;
+    }
     if (kind1 < kind2 || len1 < len2) {
         out = PyList_New(1);
         if (out == NULL)



More information about the Python-checkins mailing list