From 0bd2fd757913e6519e3b3a1b05bfcaba927ca21e Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Sun, 15 Jun 2025 22:46:35 -0500
Subject: [PATCH 1/5] Initial stab at implementing Stefan Pochmann's spiffy new
 minrun scheme.

---
 Misc/ACKS            |   1 +
 Objects/listobject.c |  51 +++++++++---------
 Objects/listsort.txt | 126 +++++++++++++++++++++++++++++++++++++------
 3 files changed, 137 insertions(+), 41 deletions(-)

diff --git a/Misc/ACKS b/Misc/ACKS
index d4557a03eb5400..f6cc0b54cbed4e 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1480,6 +1480,7 @@ Jean-François Piéronne
 Oleg Plakhotnyuk
 Anatoliy Platonov
 Marcel Plch
+Stefan Pochmann
 Kirill Podoprigora
 Remi Pointel
 Jon Poler
diff --git a/Objects/listobject.c b/Objects/listobject.c
index c5895645a2dd12..49683e5e005532 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1684,10 +1684,7 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
 /* Avoid malloc for small temp arrays. */
 #define MERGESTATE_TEMP_SIZE 256
 
-/* The largest value of minrun. This must be a power of 2, and >= 1, so that
- * the compute_minrun() algorithm guarantees to return a result no larger than
- * this,
- */
+/* The largest value of minrun. This must be a power of 2, and >= 1 */
 #define MAX_MINRUN 64
 #if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
 #error "MAX_MINRUN must be a power of 2, and >= 1"
@@ -1748,6 +1745,12 @@ struct s_MergeState {
      * of tuples. It may be set to safe_object_compare, but the idea is that hopefully
      * we can assume more, and use one of the special-case compares. */
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
+
+    /* Varisbles used for minrun computation. The "ideal" minrun length is
+     * the infinite precision listlen / 2**e, which is represented as the
+     * marhematical value of mr_int + mr_frac / 2**e.
+     */
+     Py_ssize_t mr_int, mr_frac, mr_current_frac, mr_e, mr_mask;
 };
 
 /* binarysort is the best method for sorting small arrays: it does few
@@ -2209,6 +2212,16 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc,
     ms->min_gallop = MIN_GALLOP;
     ms->listlen = list_size;
     ms->basekeys = lo->keys;
+
+    ms->mr_int = list_size;
+    ms->mr_e = 0;
+    while (ms->mr_int >= MAX_MINRUN) {
+        ms->mr_int >>= 1;
+        ++ms->mr_e;
+    }
+    ms->mr_mask = (1 << ms->mr_e) - 1;
+    ms->mr_frac = list_size & ms->mr_mask;
+    ms->mr_current_frac = 0;
 }
 
 /* Free all the temp memory owned by the MergeState.  This must be called
@@ -2686,27 +2699,15 @@ merge_force_collapse(MergeState *ms)
     return 0;
 }
 
-/* Compute a good value for the minimum run length; natural runs shorter
- * than this are boosted artificially via binary insertion.
- *
- * If n < MAX_MINRUN return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return MAX_MINRUN / 2.
- * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
- * close to, but strictly less than, an exact power of 2.
- *
- * See listsort.txt for more info.
- */
-static Py_ssize_t
-merge_compute_minrun(Py_ssize_t n)
+/* Return the next minrun value to use. See listsort.txt. */
+static inline Py_ssize_t
+minrun_next(MergeState *ms)
 {
-    Py_ssize_t r = 0;           /* becomes 1 if any 1 bits are shifted off */
-
-    assert(n >= 0);
-    while (n >= MAX_MINRUN) {
-        r |= n & 1;
-        n >>= 1;
-    }
-    return n + r;
+    ms->mr_current_frac += ms->mr_frac;
+    assert(ms->mr_current_frac >> ms->mr_e <= 1);
+    Py_ssize_t result = ms->mr_int + (ms->mr_current_frac >> ms->mr_e);
+    ms->mr_current_frac &= ms->mr_mask;
+    return result;
 }
 
 /* Here we define custom comparison functions to optimize for the cases one commonly
@@ -3074,7 +3075,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
     /* March over the array once, left to right, finding natural runs,
      * and extending short natural runs to minrun elements.
      */
-    minrun = merge_compute_minrun(nremaining);
     do {
         Py_ssize_t n;
 
@@ -3083,6 +3083,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
         if (n < 0)
             goto fail;
         /* If short, extend to min(minrun, nremaining). */
+        minrun = minrun_next(&ms);
         if (n < minrun) {
             const Py_ssize_t force = nremaining <= minrun ?
                               nremaining : minrun;
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index f387d9c116e502..2d62606f3461b4 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -270,8 +270,8 @@ result.  This has two primary good effects:
 
 Computing minrun
 ----------------
-If N < MAX_MINRUN, minrun is N.  IOW, binary insertion sort is used for the 
-whole array then; it's hard to beat that given the overheads of trying 
+If N < MAX_MINRUN, minrun is N.  IOW, binary insertion sort is used for the
+whole array then; it's hard to beat that given the overheads of trying
 something fancier (see note BINSORT).
 
 When N is a power of 2, testing on random data showed that minrun values of
@@ -288,7 +288,6 @@ that 32 isn't a good choice for the general case!  Consider N=2112:
 
 >>> divmod(2112, 32)
 (66, 0)
->>>
 
 If the data is randomly ordered, we're very likely to end up with 66 runs
 each of length 32.  The first 64 of these trigger a sequence of perfectly
@@ -301,22 +300,40 @@ to get 64 elements into place).
 If we take minrun=33 in this case, then we're very likely to end up with 64
 runs each of length 33, and then all merges are perfectly balanced.  Better!
 
-What we want to avoid is picking minrun such that in
+The original code used a cheap heuristic to pick a minrun that avoided the
+very worst cases of imbalance for the final merge, but "pretty bad" cases
+still existed.
 
-    q, r = divmod(N, minrun)
+In 2025, Stefan Pochmann found a much better approach, based on letting minrun
+vary a bit from one run to the next. Under his scheme, at _all_ levels of the
+merge tree:
 
-q is a power of 2 and r>0 (then the last merge only gets r elements into
-place, and r < minrun is small compared to N), or q a little larger than a
-power of 2 regardless of r (then we've got a case similar to "2112", again
-leaving too little work for the last merge to do).
+- The number of runs is a power of 2.
+- At most two different run lengths appear.
+- When two do appear, the smaller is one less than the larger.
+- The lengths of run pairs merged never differ by more than one.
 
-Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that 
-N/minrun is exactly a power of 2, or if that isn't possible, is close to, but 
-strictly less than, a power of 2.  This is easier to do than it may sound: 
-take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining 
-bits are set. In fact, that rule covers every case in this section, including 
-small N and exact powers of 2; merge_compute_minrun() is a deceptively simple 
-function.
+So, in all respects, as perfectly balanced as possible.
+
+For the 2112 case, that also keeps minrun at 33, but we were lucky there
+that 2112 is a power of 2 times 33. The new approach doesn't rely on luck.
+
+The basic idea is to conceive of the ideal run length as being a real number
+rather than just an integer. For an array of length `n`, let `e` be the
+smallest int such that n/2**e < MAX_MINRUN. Then mr = n/2**e is the ideal
+run length, and obviously mr * 2**e is n, so there are exactly 2**e runs.
+
+Of course runs can't have a fractional length, so we start the i'th (zero-
+based) run at index int(mr * i), for i in range(2**e). The differences between
+adjacent starting indices are the run lengths, and it's left as an exercise
+for the reader to show that they have the nice properties listed above. See
+note MINRUN CODE for an executable Python implementation to help make it all
+concrete.
+
+The code doesn't actually compute the starting indices, or use floats. Instead
+mr is represented as a pair of integers such that the infinite precision mr is
+equal to mr_int + mr_frac / 2**e, and only the delta (run length) from one
+index to the next is computed.
 
 
 The Merge Pattern
@@ -820,3 +837,80 @@ partially mitigated by pre-scanning the data to determine whether the data is
 homogeneous with respect to type.  If so, it is sometimes possible to
 substitute faster type-specific comparisons for the slower, generic
 PyObject_RichCompareBool.
+
+MINRUN CODE
+from itertools import accumulate
+try:
+    from itertools import batched
+except ImportError:
+    from itertools import islice
+    def batched(xs, k):
+        it = iter(xs)
+        while chunk := tuple(islice(it, k)):
+            yield chunk
+
+MAX_MINRUN = 64
+
+def gen_minruns(n):
+    # mr_int = minrun's integral part
+    # mr_frac = minrun's fractional part with mr_e bits and
+    # mask mr_mask
+    mr_int = n
+    mr_e = 0
+    while mr_int >= MAX_MINRUN:
+        mr_int >>= 1
+        mr_e += 1
+    mr_mask = (1 << mr_e) - 1
+    mr_frac = n & mr_mask
+
+    mr_current_frac = 0
+    while True:
+        mr_current_frac += mr_frac
+        assert mr_current_frac >> mr_e <= 1
+        yield mr_int + (mr_current_frac >> mr_e)
+        mr_current_frac &= mr_mask
+
+def chew(n, show=False):
+    if n < 1:
+        return
+
+    sizes = []
+    tot = 0
+    for size in gen_minruns(n):
+        sizes.append(size)
+        tot += size
+        if tot >= n:
+            break
+    assert tot == n
+    print(n, len(sizes))
+
+    small, large = 32, 64
+    while len(sizes) > 1:
+        assert not len(sizes) & 1
+        assert len(sizes).bit_count() == 1 # i.e., power of 2
+        assert sum(sizes) == n
+        assert min(sizes) >= min(n, small)
+        assert max(sizes) <= large
+
+        d = set(sizes)
+        assert len(d) <= 2
+        if len(d) == 2:
+            lo, hi = sorted(d)
+            assert lo + 1 == hi
+
+        mr = n / len(sizes)
+        for i, s in enumerate(accumulate(sizes, initial=0)):
+            assert int(mr * i) == s
+
+        newsizes = []
+        for a, b in batched(sizes, 2):
+            assert abs(a - b) <= 1
+            newsizes.append(a + b)
+        sizes = newsizes
+        small = large
+        large *= 2
+
+    assert sizes[0] == n
+
+for n in range(2_000_001):
+    chew(n)

From b4e4451c46a24c44521bc2988ac5170632345166 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Mon, 16 Jun 2025 03:56:20 +0000
Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst               | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst
new file mode 100644
index 00000000000000..7d0fb033af0299
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst
@@ -0,0 +1 @@
+Sorting randomly ordered arrays will often run a bit faster, thanks to a new scheme for picking minimum run lengths from Stefan Pochmann, which arranges for the merge tree to be as evenly balanced as is possible.

From 835190d7d6aa6a11ce966c6a29c3d68cf5149c7e Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Sun, 15 Jun 2025 23:00:50 -0500
Subject: [PATCH 3/5] s/arrays/lists/

---
 .../2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst
index 7d0fb033af0299..22dda2a3e972a8 100644
--- a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst
@@ -1 +1 @@
-Sorting randomly ordered arrays will often run a bit faster, thanks to a new scheme for picking minimum run lengths from Stefan Pochmann, which arranges for the merge tree to be as evenly balanced as is possible.
+Sorting randomly ordered lists will often run a bit faster, thanks to a new scheme for picking minimum run lengths from Stefan Pochmann, which arranges for the merge tree to be as evenly balanced as is possible.

From 8138c2e6a664d43e322802cadc2bf995fb2b280d Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Mon, 16 Jun 2025 13:10:56 -0500
Subject: [PATCH 4/5] Fold in Stefan's suggestions, and minor cleanup.

---
 Objects/listobject.c |  23 ++++------
 Objects/listsort.txt | 107 +++++++++++++++++++++++++++++++------------
 2 files changed, 88 insertions(+), 42 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 49683e5e005532..d76ee529a43e65 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1747,10 +1747,9 @@ struct s_MergeState {
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
 
     /* Varisbles used for minrun computation. The "ideal" minrun length is
-     * the infinite precision listlen / 2**e, which is represented as the
-     * marhematical value of mr_int + mr_frac / 2**e.
+     * the infinite precision listlen / 2**e. See listlen.txt.
      */
-     Py_ssize_t mr_int, mr_frac, mr_current_frac, mr_e, mr_mask;
+     Py_ssize_t mr_current, mr_e, mr_mask;
 };
 
 /* binarysort is the best method for sorting small arrays: it does few
@@ -2213,15 +2212,13 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc,
     ms->listlen = list_size;
     ms->basekeys = lo->keys;
 
-    ms->mr_int = list_size;
+    /* State for generating minrun values. See listsort.txt. */
     ms->mr_e = 0;
-    while (ms->mr_int >= MAX_MINRUN) {
-        ms->mr_int >>= 1;
+    while (list_size >> ms->mr_e >= MAX_MINRUN) {
         ++ms->mr_e;
     }
     ms->mr_mask = (1 << ms->mr_e) - 1;
-    ms->mr_frac = list_size & ms->mr_mask;
-    ms->mr_current_frac = 0;
+    ms->mr_current = 0;
 }
 
 /* Free all the temp memory owned by the MergeState.  This must be called
@@ -2700,13 +2697,13 @@ merge_force_collapse(MergeState *ms)
 }
 
 /* Return the next minrun value to use. See listsort.txt. */
-static inline Py_ssize_t
+Py_LOCAL_INLINE(Py_ssize_t)
 minrun_next(MergeState *ms)
 {
-    ms->mr_current_frac += ms->mr_frac;
-    assert(ms->mr_current_frac >> ms->mr_e <= 1);
-    Py_ssize_t result = ms->mr_int + (ms->mr_current_frac >> ms->mr_e);
-    ms->mr_current_frac &= ms->mr_mask;
+    ms->mr_current += ms->listlen;
+    assert(ms->mr_current >= 0); /* no overflow */
+    Py_ssize_t result = ms->mr_current >> ms->mr_e;
+    ms->mr_current &= ms->mr_mask;
     return result;
 }
 
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 2d62606f3461b4..5b2fc7d50a25ca 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -316,24 +316,78 @@ merge tree:
 So, in all respects, as perfectly balanced as possible.
 
 For the 2112 case, that also keeps minrun at 33, but we were lucky there
-that 2112 is a power of 2 times 33. The new approach doesn't rely on luck.
+that 2112 is 33 times a power of 2. The new approach doesn't rely on luck.
 
-The basic idea is to conceive of the ideal run length as being a real number
-rather than just an integer. For an array of length `n`, let `e` be the
-smallest int such that n/2**e < MAX_MINRUN. Then mr = n/2**e is the ideal
-run length, and obviously mr * 2**e is n, so there are exactly 2**e runs.
+For example, with 315 random elements, the old scheme uses fixed minrun=40 and
+produces runs of length 40, except for the last. The new scheme produces a
+mix of lengths 39 and 40:
 
-Of course runs can't have a fractional length, so we start the i'th (zero-
-based) run at index int(mr * i), for i in range(2**e). The differences between
-adjacent starting indices are the run lengths, and it's left as an exercise
-for the reader to show that they have the nice properties listed above. See
-note MINRUN CODE for an executable Python implementation to help make it all
-concrete.
+old:  40 40 40 40 40 40 40 35
+new:  39 39 40 39 39 40 39 40
 
-The code doesn't actually compute the starting indices, or use floats. Instead
-mr is represented as a pair of integers such that the infinite precision mr is
-equal to mr_int + mr_frac / 2**e, and only the delta (run length) from one
-index to the next is computed.
+Both schemes produce eight runs, a power of 2. That's good for a balanced
+merge tree. But the new scheme allows merges where left and right lengths
+never differ by more than 1:
+
+39 39 40 39 39 40 39 40
+  78   79     79   79
+    157         158
+          315
+
+(This shows merges downward, e.g., two runs of length 39 are merged and
+become a run of length 78.)
+
+With larger lists, the old scheme can get even more unbalanced. For example,
+with 32769 elements (that's 2**15 + 1), it uses minrun=33 and produces 993
+runs (of length 33). That's not even a power of 2. The new scheme instead
+produces 1024 runs, all with length 32 except for the last one with length 33.
+
+How does it work? Ideally, all runs would be exactly equally long. For the
+above example, each run would have 315/8 = 39.375 elements. Which of course
+doesn't work. But we can get close:
+
+For the first run, we'd like 39.375 elements. Since that's impossible, we
+instead use 39 (the floor) and remember the current leftover fraction 0.375.
+For the second run, we add 0.375 + 39.375 = 39.75. Again impossible, so we
+instead use 39 and remember 0.75. For the third run, we add 0.75 + 39.375 =
+40.125. This time we get 40 and remember 0.125. And so on. Here's a Python
+generator doing that:
+
+def gen_minruns_with_floats(n):
+    mr = n
+    while mr >= MAX_MINRUN:
+        mr /= 2
+
+    mr_current = 0
+    while True:
+        mr_current += mr
+        yield int(mr_current)
+        mr_current %= 1
+
+But while all arithmetic here can be done exactly using binary floating point,
+floats have less precision than a Py_ssize_t, and mixing floats with ints is
+needlessly expensive anyway.
+
+So here's an integer version, where the internal numbers are scaled up by
+2**e, or rather not divided by 2**e. Instead, only each yielded minrun gets
+divided (by right-shifting). For example instead of adding 39.375 and
+reducing modulo 1, it just adds 315 and reduces modulo 8. And always divides
+by 8 to get each actual minrun value:
+
+def gen_minruns_simpler(n):
+    e = 0
+    while (n >> e) >= MAX_MINRUN:
+        e += 1
+    mask = (1 << e) - 1
+
+    mr_current = 0
+    while True:
+        mr_current += n
+        yield mr_current >> e
+        mr_current &= mask
+
+See note MINRUN CODE for a full implementation and a driver that exhaustively
+verifies the claims above for all list lengths through 2 million.
 
 
 The Merge Pattern
@@ -852,23 +906,18 @@ except ImportError:
 MAX_MINRUN = 64
 
 def gen_minruns(n):
-    # mr_int = minrun's integral part
-    # mr_frac = minrun's fractional part with mr_e bits and
-    # mask mr_mask
-    mr_int = n
+    # In listobject.c, initialization is done in merge_init(), and
+    # the body of the loop in minrun_next().
     mr_e = 0
-    while mr_int >= MAX_MINRUN:
-        mr_int >>= 1
+    while (n >> mr_e) >= MAX_MINRUN:
         mr_e += 1
     mr_mask = (1 << mr_e) - 1
-    mr_frac = n & mr_mask
 
-    mr_current_frac = 0
+    mr_current = 0
     while True:
-        mr_current_frac += mr_frac
-        assert mr_current_frac >> mr_e <= 1
-        yield mr_int + (mr_current_frac >> mr_e)
-        mr_current_frac &= mr_mask
+        mr_current += n
+        yield mr_current >> mr_e
+        mr_current &= mr_mask
 
 def chew(n, show=False):
     if n < 1:
@@ -884,7 +933,7 @@ def chew(n, show=False):
     assert tot == n
     print(n, len(sizes))
 
-    small, large = 32, 64
+    small, large = MAX_MINRUN // 2, MAX_MINRUN
     while len(sizes) > 1:
         assert not len(sizes) & 1
         assert len(sizes).bit_count() == 1 # i.e., power of 2
@@ -913,4 +962,4 @@ def chew(n, show=False):
     assert sizes[0] == n
 
 for n in range(2_000_001):
-    chew(n)
+    chew(n)
\ No newline at end of file

From a07d23c11ca7f37db5f22cbc6e90f170a7559bc9 Mon Sep 17 00:00:00 2001
From: Tim Peters <tim.peters@gmail.com>
Date: Mon, 16 Jun 2025 13:15:17 -0500
Subject: [PATCH 5/5] Typo repair.

---
 Objects/listobject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index d76ee529a43e65..46a71db7b09385 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1747,7 +1747,7 @@ struct s_MergeState {
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
 
     /* Varisbles used for minrun computation. The "ideal" minrun length is
-     * the infinite precision listlen / 2**e. See listlen.txt.
+     * the infinite precision listlen / 2**e. See listsort.txt.
      */
      Py_ssize_t mr_current, mr_e, mr_mask;
 };