[Pythoncheckins] bpo34561: Switch to Munro & Wild "powersort" merge strategy. (#28108)
timone
webhookmailer at python.org
Mon Sep 6 13:54:51 EDT 2021
https://github.com/python/cpython/commit/5cb4c672d855033592f0e05162f887def236c00a
commit: 5cb4c672d855033592f0e05162f887def236c00a
branch: main
author: Tim Peters <tim.peters at gmail.com>
committer: timone <tim.peters at gmail.com>
date: 20210906T12:54:4105:00
summary:
bpo34561: Switch to Munro & Wild "powersort" merge strategy. (#28108)
For list.sort(), replace our ad hoc merge ordering strategy with the principled, elegant,
and provably nearoptimal one from Munro and Wild's "powersort".
files:
A Misc/NEWS.d/next/Core and Builtins/20210901192148.bpo34561.uMAVA.rst
M Objects/listobject.c
M Objects/listsort.txt
diff git a/Misc/NEWS.d/next/Core and Builtins/20210901192148.bpo34561.uMAVA.rst b/Misc/NEWS.d/next/Core and Builtins/20210901192148.bpo34561.uMAVA.rst
new file mode 100644
index 0000000000000..7c48cb39df1c5
 /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/20210901192148.bpo34561.uMAVA.rst
@@ 0,0 +1 @@
+List sorting now uses the mergeordering strategy from Munro and Wild's ``powersort()``. Unlike the former strategy, this is provably nearoptimal in the entropy of the distribution of run lengths. Most uses of ``list.sort()`` probably won't see a significant time difference, but may see significant improvements in cases where the former strategy was exceptionally poor. However, as these are all fast lineartime approximations to a problem that's inherently at best quadratictime to solve truly optimally, it's also possible to contrive cases where the former strategy did better.
\ No newline at end of file
diff git a/Objects/listobject.c b/Objects/listobject.c
index 898cbc20c5f81..565c11e7f384f 100644
 a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ 1139,12 +1139,11 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
if (k)
/* The maximum number of entries in a MergeState's pendingruns stack.
 * This is enough to sort arrays of size up to about
 * 32 * phi ** MAX_MERGE_PENDING
 * where phi ~= 1.618. 85 is ridiculouslylarge enough, good for an array
 * with 2**64 elements.
+ * For a list with n elements, this needs at most floor(log2(n)) + 1 entries
+ * even if we didn't force runs to a minimal length. So the number of bits
+ * in a Py_ssize_t is plenty large enough for all cases.
*/
#define MAX_MERGE_PENDING 85
+#define MAX_MERGE_PENDING (SIZEOF_SIZE_T * 8)
/* When we get into galloping mode, we stay there until both runs win less
* often than MIN_GALLOP consecutive times. See listsort.txt for more info.
@@ 1159,7 +1158,8 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
*/
struct s_slice {
sortslice base;
 Py_ssize_t len;
+ Py_ssize_t len; /* length of run */
+ int power; /* node "level" for powersort merge strategy */
};
typedef struct s_MergeState MergeState;
@@ 1170,6 +1170,9 @@ struct s_MergeState {
*/
Py_ssize_t min_gallop;
+ Py_ssize_t listlen; /* len(input_list)  read only */
+ PyObject **basekeys; /* base address of keys array  read only */
+
/* 'a' is temp storage to help with merges. It contains room for
* alloced entries.
*/
@@ 1513,7 +1516,8 @@ gallop_right(MergeState *ms, PyObject *key, PyObject **a, Py_ssize_t n, Py_ssize
/* Conceptually a MergeState's constructor. */
static void
merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc)
+merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc,
+ sortslice *lo)
{
assert(ms != NULL);
if (has_keyfunc) {
@@ 1538,6 +1542,8 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc)
ms>a.keys = ms>temparray;
ms>n = 0;
ms>min_gallop = MIN_GALLOP;
+ ms>listlen = list_size;
+ ms>basekeys = lo>keys;
}
/* Free all the temp memory owned by the MergeState. This must be called
@@ 1920,37 +1926,74 @@ merge_at(MergeState *ms, Py_ssize_t i)
return merge_hi(ms, ssa, na, ssb, nb);
}
/* Examine the stack of runs waiting to be merged, merging adjacent runs
 * until the stack invariants are reestablished:
 *
 * 1. len[3] > len[2] + len[1]
 * 2. len[2] > len[1]
+/* Two adjacent runs begin at index s1. The first run has length n1, and
+ * the second run (starting at index s1+n1) has length n2. The list has total
+ * length n.
+ * Compute the "power" of the first run. See listsort.txt for details.
+ */
+static int
+powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n)
+{
+ int result = 0;
+ assert(s1 >= 0);
+ assert(n1 > 0 && n2 > 0);
+ assert(s1 + n1 + n2 <= n);
+ /* midpoints a and b:
+ * a = s1 + n1/2
+ * b = s1 + n1 + n2/2 = a + (n1 + n2)/2
+ *
+ * Those may not be integers, though, because of the "/2". So we work with
+ * 2*a and 2*b instead, which are necessarily integers. It makes no
+ * difference to the outcome, since the bits in the expansion of (2*i)/n
+ * are merely shifted one position from those of i/n.
+ */
+ Py_ssize_t a = 2 * s1 + n1; /* 2*a */
+ Py_ssize_t b = a + n1 + n2; /* 2*b */
+ /* Emulate a/n and b/n one bit a time, until bits differ. */
+ for (;;) {
+ ++result;
+ if (a >= n) { /* both quotient bits are 1 */
+ assert(b >= a);
+ a = n;
+ b = n;
+ }
+ else if (b >= n) { /* a/n bit is 0, b/n bit is 1 */
+ break;
+ } /* else both quotient bits are 0 */
+ assert(a < b && b < n);
+ a <<= 1;
+ b <<= 1;
+ }
+ return result;
+}
+
+/* The next run has been identified, of length n2.
+ * If there's already a run on the stack, apply the "powersort" merge strategy:
+ * compute the topmost run's "power" (depth in a conceptual binary merge tree)
+ * and merge adjacent runs on the stack with greater power. See listsort.txt
+ * for more info.
*
 * See listsort.txt for more info.
+ * It's the caller's responsibilty to push the new run on the stack when this
+ * returns.
*
* Returns 0 on success, 1 on error.
*/
static int
merge_collapse(MergeState *ms)
+found_new_run(MergeState *ms, Py_ssize_t n2)
{
 struct s_slice *p = ms>pending;

assert(ms);
 while (ms>n > 1) {
 Py_ssize_t n = ms>n  2;
 if ((n > 0 && p[n1].len <= p[n].len + p[n+1].len) 
 (n > 1 && p[n2].len <= p[n1].len + p[n].len)) {
 if (p[n1].len < p[n+1].len)
 n;
 if (merge_at(ms, n) < 0)
+ if (ms>n) {
+ assert(ms>n > 0);
+ struct s_slice *p = ms>pending;
+ Py_ssize_t s1 = p[ms>n  1].base.keys  ms>basekeys; /* start index */
+ Py_ssize_t n1 = p[ms>n  1].len;
+ int power = powerloop(s1, n1, n2, ms>listlen);
+ while (ms>n > 1 && p[ms>n  2].power > power) {
+ if (merge_at(ms, ms>n  2) < 0)
return 1;
}
 else if (p[n].len <= p[n+1].len) {
 if (merge_at(ms, n) < 0)
 return 1;
 }
 else
 break;
+ assert(ms>n < 2  p[ms>n  2].power < power);
+ p[ms>n  1].power = power;
}
return 0;
}
@@ 2357,7 +2400,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
}
/* End of presort check: ms is now set properly! */
 merge_init(&ms, saved_ob_size, keys != NULL);
+ merge_init(&ms, saved_ob_size, keys != NULL, &lo);
nremaining = saved_ob_size;
if (nremaining < 2)
@@ 2393,13 +2436,16 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
goto fail;
n = force;
}
 /* Push run onto pendingruns stack, and maybe merge. */
+ /* Maybe merge pending runs. */
+ assert(ms.n == 0  ms.pending[ms.n 1].base.keys +
+ ms.pending[ms.n1].len == lo.keys);
+ if (found_new_run(&ms, n) < 0)
+ goto fail;
+ /* Push new run on stack. */
assert(ms.n < MAX_MERGE_PENDING);
ms.pending[ms.n].base = lo;
ms.pending[ms.n].len = n;
++ms.n;
 if (merge_collapse(&ms) < 0)
 goto fail;
/* Advance to find next run. */
sortslice_advance(&lo, n);
nremaining = n;
diff git a/Objects/listsort.txt b/Objects/listsort.txt
index 174777a2658dc..306e5e44d2ecf 100644
 a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ 318,65 +318,104 @@ merging must be done as (A+B)+C or A+(B+C) instead.
So merging is always done on two consecutive runs at a time, and inplace,
although this may require some temp memory (more on that later).
When a run is identified, its base address and length are pushed on a stack
in the MergeState struct. merge_collapse() is then called to potentially
merge runs on that stack. We would like to delay merging as long as possible
in order to exploit patterns that may come up later, but we like even more to
do merging as soon as possible to exploit that the run just found is still
high in the memory hierarchy. We also can't delay merging "too long" because
it consumes memory to remember the runs that are still unmerged, and the
stack has a fixed size.

What turned out to be a good compromise maintains two invariants on the
stack entries, where A, B and C are the lengths of the three rightmost notyet
merged slices:

1. A > B+C
2. B > C

Note that, by induction, #2 implies the lengths of pending runs form a
decreasing sequence. #1 implies that, reading the lengths right to left,
the pendingrun lengths grow at least as fast as the Fibonacci numbers.
Therefore the stack can never grow larger than about log_base_phi(N) entries,
where phi = (1+sqrt(5))/2 ~= 1.618. Thus a small # of stack slots suffice
for very large arrays.

If A <= B+C, the smaller of A and C is merged with B (ties favor C, for the
freshnessincache reason), and the new run replaces the A,B or B,C entries;
e.g., if the last 3 entries are

 A:30 B:20 C:10

then B is merged with C, leaving

 A:30 BC:30

on the stack. Or if they were

 A:500 B:400: C:1000

then A is merged with B, leaving

 AB:900 C:1000

on the stack.

In both examples, the stack configuration after the merge still violates
invariant #2, and merge_collapse() goes on to continue merging runs until
both invariants are satisfied. As an extreme case, suppose we didn't do the
minrun gimmick, and natural runs were of lengths 128, 64, 32, 16, 8, 4, 2,
and 2. Nothing would get merged until the final 2 was seen, and that would
trigger 7 perfectly balanced merges.

The thrust of these rules when they trigger merging is to balance the run
lengths as closely as possible, while keeping a low bound on the number of
runs we have to remember. This is maximally effective for random data,
where all runs are likely to be of (artificially forced) length minrun, and
then we get a sequence of perfectly balanced merges (with, perhaps, some
oddballs at the end).

OTOH, one reason this sort is so good for partly ordered data has to do
with wildly unbalanced run lengths.
+When a run is identified, its length is passed to found_new_run() to
+potentially merge runs on a stack of pending runs. We would like to delay
+merging as long as possible in order to exploit patterns that may come up
+later, but we like even more to do merging as soon as possible to exploit
+that the run just found is still high in the memory hierarchy. We also can't
+delay merging "too long" because it consumes memory to remember the runs that
+are still unmerged, and the stack has a fixed size.
+
+The original version of this code used the first thing I made up that didn't
+obviously suck ;) It was loosely based on invariants involving the Fibonacci
+sequence.
+
+It worked OK, but it was hard to reason about, and was subtle enough that the
+intended invariants weren't actually preserved. Researchers discovered that
+when trying to complete a computergenerated correctness proof. That was
+easilyenough repaired, but the discovery spurred quite a bit of academic
+interest in truly good ways to manage incremental merging on the fly.
+
+At least a dozen different approaches were developed, some provably having
+nearoptimal worst case behavior with respect to the entropy of the
+distribution of run lengths. Some details can be found in bpo34561.
+
+The code now uses the "powersort" merge strategy from:
+
+ "NearlyOptimal Mergesorts: Fast, Practical Sorting Methods
+ That Optimally Adapt to Existing Runs"
+ J. Ian Munro and Sebastian Wild
+
+The code is pretty simple, but the justification is quite involved, as it's
+based on fast approximations to optimal binary search trees, which are
+substantial topics on their own.
+
+Here we'll just cover some pragmatic details:
+
+The `powerloop()` function computes a run's "power". Say two adjacent runs
+begin at index s1. The first run has length n1, and the second run (starting
+at index s1+n1, called "s2" below) has length n2. The list has total length n.
+The "power" of the first run is a small integer, the depth of the node
+connecting the two runs in an ideal binary merge tree, where power 1 is the
+root node, and the power increases by 1 for each level deeper in the tree.
+
+The power is the least integer L such that the "midpoint interval" contains
+a rational number of the form J/2**L. The midpoint interval is the semi
+closed interval:
+
+ ((s1 + n1/2)/n, (s2 + n2/2)/n]
+
+Yes, that's brainbusting at first ;) Concretely, if (s1 + n1/2)/n and
+(s2 + n2/2)/n are computed to infinite precision in binary, the power L is
+the first position at which the 2**L bit differs between the expansions.
+Since the left end of the interval is less than the right end, the first
+differing bit must be a 0 bit in the left quotient and a 1 bit in the right
+quotient.
+
+`powerloop()` emulates these divisions, 1 bit at a time, using comparisons,
+subtractions, and shifts in a loop.
+
+You'll notice the paper uses an O(1) method instead, but that relies on two
+things we don't have:
+
+ An O(1) "count leading zeroes" primitive. We can find such a thing as a C
+ extension on most platforms, but not all, and there's no uniform spelling
+ on the platforms that support it.
+
+ Integer divison on an integer type twice as wide as needed to hold the
+ list length. But the latter is Py_ssize_t for us, and is typically the
+ widest native signed integer type the platform supports.
+
+But since runs in our algorithm are almost never very short, the onceperrun
+overhead of `powerloop()` seems lost in the noise.
+
+Detail: why is Py_ssize_t "wide enough" in `powerloop()`? We do, after all,
+shift integers of that width left by 1. How do we know that won't spill into
+the sign bit? The trick is that we have some slop. `n` (the total list
+length) is the number of list elements, which is at most 4 times (on a 32box,
+with 4byte pointers) smaller than than the largest size_t. So at least the
+leading two bits of the integers we're using are clear.
+
+Since we can't compute a run's power before seeing the run that follows it,
+the mostrecently identified run is never merged by `found_new_run()`.
+Instead a new run is only used to compute the 2ndmostrecent run's power.
+Then adjacent runs are merged so long as their saved power (tree depth) is
+greater than that newly computed power. When found_new_run() returns, only
+then is a new run pushed on to the stack of pending runs.
+
+A key invariant is that powers on the run stack are strictly decreasing
+(starting from the run at the top of the stack).
+
+Note that even powersort's strategy isn't always truly optimal. It can't be.
+Computing an optimal merge sequence can be done in time quadratic in the
+number of runs, which is very much slower, and also requires finding &
+remembering _all_ the runs' lengths (of which there may be billions) in
+advance. It's remarkable, though, how close to optimal this strategy gets.
+
+Curious factoid: of all the alternatives I've seen in the literature,
+powersort's is the only one that's always truly optimal for a collection of 3
+run lengths (for three lengths A B C, it's always optimal to first merge the
+shorter of A and C with B).
Merge Memory
More information about the Pythoncheckins
mailing list