From 0bd2fd757913e6519e3b3a1b05bfcaba927ca21e Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 15 Jun 2025 22:46:35 -0500 Subject: [PATCH 1/5] Initial stab at implementing Stefan Pochmann's spiffy new minrun scheme. --- Misc/ACKS | 1 + Objects/listobject.c | 51 +++++++++--------- Objects/listsort.txt | 126 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 137 insertions(+), 41 deletions(-) diff --git a/Misc/ACKS b/Misc/ACKS index d4557a03eb5400..f6cc0b54cbed4e 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1480,6 +1480,7 @@ Jean-François Piéronne Oleg Plakhotnyuk Anatoliy Platonov Marcel Plch +Stefan Pochmann Kirill Podoprigora Remi Pointel Jon Poler diff --git a/Objects/listobject.c b/Objects/listobject.c index c5895645a2dd12..49683e5e005532 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1684,10 +1684,7 @@ sortslice_advance(sortslice *slice, Py_ssize_t n) /* Avoid malloc for small temp arrays. */ #define MERGESTATE_TEMP_SIZE 256 -/* The largest value of minrun. This must be a power of 2, and >= 1, so that - * the compute_minrun() algorithm guarantees to return a result no larger than - * this, - */ +/* The largest value of minrun. This must be a power of 2, and >= 1 */ #define MAX_MINRUN 64 #if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1)) #error "MAX_MINRUN must be a power of 2, and >= 1" @@ -1748,6 +1745,12 @@ struct s_MergeState { * of tuples. It may be set to safe_object_compare, but the idea is that hopefully * we can assume more, and use one of the special-case compares. */ int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *); + + /* Varisbles used for minrun computation. The "ideal" minrun length is + * the infinite precision listlen / 2**e, which is represented as the + * marhematical value of mr_int + mr_frac / 2**e. 
+ */ + Py_ssize_t mr_int, mr_frac, mr_current_frac, mr_e, mr_mask; }; /* binarysort is the best method for sorting small arrays: it does few @@ -2209,6 +2212,16 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc, ms->min_gallop = MIN_GALLOP; ms->listlen = list_size; ms->basekeys = lo->keys; + + ms->mr_int = list_size; + ms->mr_e = 0; + while (ms->mr_int >= MAX_MINRUN) { + ms->mr_int >>= 1; + ++ms->mr_e; + } + ms->mr_mask = (1 << ms->mr_e) - 1; + ms->mr_frac = list_size & ms->mr_mask; + ms->mr_current_frac = 0; } /* Free all the temp memory owned by the MergeState. This must be called @@ -2686,27 +2699,15 @@ merge_force_collapse(MergeState *ms) return 0; } -/* Compute a good value for the minimum run length; natural runs shorter - * than this are boosted artificially via binary insertion. - * - * If n < MAX_MINRUN return n (it's too small to bother with fancy stuff). - * Else if n is an exact power of 2, return MAX_MINRUN / 2. - * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is - * close to, but strictly less than, an exact power of 2. - * - * See listsort.txt for more info. - */ -static Py_ssize_t -merge_compute_minrun(Py_ssize_t n) +/* Return the next minrun value to use. See listsort.txt. */ +static inline Py_ssize_t +minrun_next(MergeState *ms) { - Py_ssize_t r = 0; /* becomes 1 if any 1 bits are shifted off */ - - assert(n >= 0); - while (n >= MAX_MINRUN) { - r |= n & 1; - n >>= 1; - } - return n + r; + ms->mr_current_frac += ms->mr_frac; + assert(ms->mr_current_frac >> ms->mr_e <= 1); + Py_ssize_t result = ms->mr_int + (ms->mr_current_frac >> ms->mr_e); + ms->mr_current_frac &= ms->mr_mask; + return result; } /* Here we define custom comparison functions to optimize for the cases one commonly @@ -3074,7 +3075,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* March over the array once, left to right, finding natural runs, * and extending short natural runs to minrun elements. 
*/ - minrun = merge_compute_minrun(nremaining); do { Py_ssize_t n; @@ -3083,6 +3083,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) if (n < 0) goto fail; /* If short, extend to min(minrun, nremaining). */ + minrun = minrun_next(&ms); if (n < minrun) { const Py_ssize_t force = nremaining <= minrun ? nremaining : minrun; diff --git a/Objects/listsort.txt b/Objects/listsort.txt index f387d9c116e502..2d62606f3461b4 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -270,8 +270,8 @@ result. This has two primary good effects: Computing minrun ---------------- -If N < MAX_MINRUN, minrun is N. IOW, binary insertion sort is used for the -whole array then; it's hard to beat that given the overheads of trying +If N < MAX_MINRUN, minrun is N. IOW, binary insertion sort is used for the +whole array then; it's hard to beat that given the overheads of trying something fancier (see note BINSORT). When N is a power of 2, testing on random data showed that minrun values of @@ -288,7 +288,6 @@ that 32 isn't a good choice for the general case! Consider N=2112: >>> divmod(2112, 32) (66, 0) ->>> If the data is randomly ordered, we're very likely to end up with 66 runs each of length 32. The first 64 of these trigger a sequence of perfectly @@ -301,22 +300,40 @@ to get 64 elements into place). If we take minrun=33 in this case, then we're very likely to end up with 64 runs each of length 33, and then all merges are perfectly balanced. Better! -What we want to avoid is picking minrun such that in +The original code used a cheap heuristic to pick a minrun that avoided the +very worst cases of imbalance for the final merge, but "pretty bad" cases +still existed. - q, r = divmod(N, minrun) +In 2025, Stefan Pochmann found a much better approach, based on letting minrun +vary a bit from one run to the next. 
Under his scheme, at _all_ levels of the +merge tree: -q is a power of 2 and r>0 (then the last merge only gets r elements into -place, and r < minrun is small compared to N), or q a little larger than a -power of 2 regardless of r (then we've got a case similar to "2112", again -leaving too little work for the last merge to do). +- The number of runs is a power of 2. +- At most two different run lengths appear. +- When two do appear, the smaller is one less than the larger. +- The lengths of run pairs merged never differ by more than one. -Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that -N/minrun is exactly a power of 2, or if that isn't possible, is close to, but -strictly less than, a power of 2. This is easier to do than it may sound: -take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining -bits are set. In fact, that rule covers every case in this section, including -small N and exact powers of 2; merge_compute_minrun() is a deceptively simple -function. +So, in all respects, as perfectly balanced as possible. + +For the 2112 case, that also keeps minrun at 33, but we were lucky there +that 2112 is a power of 2 times 33. The new approach doesn't rely on luck. + +The basic idea is to conceive of the ideal run length as being a real number +rather than just an integer. For an array of length `n`, let `e` be the +smallest int such that n/2**e < MAX_MINRUN. Then mr = n/2**e is the ideal +run length, and obviously mr * 2**e is n, so there are exactly 2**e runs. + +Of course runs can't have a fractional length, so we start the i'th (zero- +based) run at index int(mr * i), for i in range(2**e). The differences between +adjacent starting indices are the run lengths, and it's left as an exercise +for the reader to show that they have the nice properties listed above. See +note MINRUN CODE for an executable Python implementation to help make it all +concrete. 
+ +The code doesn't actually compute the starting indices, or use floats. Instead +mr is represented as a pair of integers such that the infinite precision mr is +equal to mr_int + mr_frac / 2**e, and only the delta (run length) from one +index to the next is computed. The Merge Pattern @@ -820,3 +837,80 @@ partially mitigated by pre-scanning the data to determine whether the data is homogeneous with respect to type. If so, it is sometimes possible to substitute faster type-specific comparisons for the slower, generic PyObject_RichCompareBool. + +MINRUN CODE +from itertools import accumulate +try: + from itertools import batched +except ImportError: + from itertools import islice + def batched(xs, k): + it = iter(xs) + while chunk := tuple(islice(it, k)): + yield chunk + +MAX_MINRUN = 64 + +def gen_minruns(n): + # mr_int = minrun's integral part + # mr_frac = minrun's fractional part with mr_e bits and + # mask mr_mask + mr_int = n + mr_e = 0 + while mr_int >= MAX_MINRUN: + mr_int >>= 1 + mr_e += 1 + mr_mask = (1 << mr_e) - 1 + mr_frac = n & mr_mask + + mr_current_frac = 0 + while True: + mr_current_frac += mr_frac + assert mr_current_frac >> mr_e <= 1 + yield mr_int + (mr_current_frac >> mr_e) + mr_current_frac &= mr_mask + +def chew(n, show=False): + if n < 1: + return + + sizes = [] + tot = 0 + for size in gen_minruns(n): + sizes.append(size) + tot += size + if tot >= n: + break + assert tot == n + print(n, len(sizes)) + + small, large = 32, 64 + while len(sizes) > 1: + assert not len(sizes) & 1 + assert len(sizes).bit_count() == 1 # i.e., power of 2 + assert sum(sizes) == n + assert min(sizes) >= min(n, small) + assert max(sizes) <= large + + d = set(sizes) + assert len(d) <= 2 + if len(d) == 2: + lo, hi = sorted(d) + assert lo + 1 == hi + + mr = n / len(sizes) + for i, s in enumerate(accumulate(sizes, initial=0)): + assert int(mr * i) == s + + newsizes = [] + for a, b in batched(sizes, 2): + assert abs(a - b) <= 1 + newsizes.append(a + b) + sizes = newsizes + 
small = large + large *= 2 + + assert sizes[0] == n + +for n in range(2_000_001): + chew(n) From b4e4451c46a24c44521bc2988ac5170632345166 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 16 Jun 2025 03:56:20 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst new file mode 100644 index 00000000000000..7d0fb033af0299 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst @@ -0,0 +1 @@ +Sorting randomly ordered arrays will often run a bit faster, thanks to a new scheme for picking minimum run lengths from Stefan Pochmann, which arranges for the merge tree to be as evenly balanced as is possible. 
From 835190d7d6aa6a11ce966c6a29c3d68cf5149c7e Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sun, 15 Jun 2025 23:00:50 -0500 Subject: [PATCH 3/5] s/arrays/lists/ --- .../2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst index 7d0fb033af0299..22dda2a3e972a8 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-16-03-56-15.gh-issue-135551.hRTQO-.rst @@ -1 +1 @@ -Sorting randomly ordered arrays will often run a bit faster, thanks to a new scheme for picking minimum run lengths from Stefan Pochmann, which arranges for the merge tree to be as evenly balanced as is possible. +Sorting randomly ordered lists will often run a bit faster, thanks to a new scheme for picking minimum run lengths from Stefan Pochmann, which arranges for the merge tree to be as evenly balanced as is possible. From 8138c2e6a664d43e322802cadc2bf995fb2b280d Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 16 Jun 2025 13:10:56 -0500 Subject: [PATCH 4/5] Fold in Stefan's suggestions, and minor cleanup. --- Objects/listobject.c | 23 ++++------ Objects/listsort.txt | 107 +++++++++++++++++++++++++++++++------------ 2 files changed, 88 insertions(+), 42 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 49683e5e005532..d76ee529a43e65 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1747,10 +1747,9 @@ struct s_MergeState { int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *); /* Varisbles used for minrun computation. The "ideal" minrun length is - * the infinite precision listlen / 2**e, which is represented as the - * marhematical value of mr_int + mr_frac / 2**e. + * the infinite precision listlen / 2**e. See listlen.txt. 
*/ - Py_ssize_t mr_int, mr_frac, mr_current_frac, mr_e, mr_mask; + Py_ssize_t mr_current, mr_e, mr_mask; }; /* binarysort is the best method for sorting small arrays: it does few @@ -2213,15 +2212,13 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc, ms->listlen = list_size; ms->basekeys = lo->keys; - ms->mr_int = list_size; + /* State for generating minrun values. See listsort.txt. */ ms->mr_e = 0; - while (ms->mr_int >= MAX_MINRUN) { - ms->mr_int >>= 1; + while (list_size >> ms->mr_e >= MAX_MINRUN) { ++ms->mr_e; } ms->mr_mask = (1 << ms->mr_e) - 1; - ms->mr_frac = list_size & ms->mr_mask; - ms->mr_current_frac = 0; + ms->mr_current = 0; } /* Free all the temp memory owned by the MergeState. This must be called @@ -2700,13 +2697,13 @@ merge_force_collapse(MergeState *ms) } /* Return the next minrun value to use. See listsort.txt. */ -static inline Py_ssize_t +Py_LOCAL_INLINE(Py_ssize_t) minrun_next(MergeState *ms) { - ms->mr_current_frac += ms->mr_frac; - assert(ms->mr_current_frac >> ms->mr_e <= 1); - Py_ssize_t result = ms->mr_int + (ms->mr_current_frac >> ms->mr_e); - ms->mr_current_frac &= ms->mr_mask; + ms->mr_current += ms->listlen; + assert(ms->mr_current >= 0); /* no overflow */ + Py_ssize_t result = ms->mr_current >> ms->mr_e; + ms->mr_current &= ms->mr_mask; return result; } diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 2d62606f3461b4..5b2fc7d50a25ca 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -316,24 +316,78 @@ merge tree: So, in all respects, as perfectly balanced as possible. For the 2112 case, that also keeps minrun at 33, but we were lucky there -that 2112 is a power of 2 times 33. The new approach doesn't rely on luck. +that 2112 is 33 times a power of 2. The new approach doesn't rely on luck. -The basic idea is to conceive of the ideal run length as being a real number -rather than just an integer. For an array of length `n`, let `e` be the -smallest int such that n/2**e < MAX_MINRUN. 
Then mr = n/2**e is the ideal -run length, and obviously mr * 2**e is n, so there are exactly 2**e runs. +For example, with 315 random elements, the old scheme uses fixed minrun=40 and +produces runs of length 40, except for the last. The new scheme produces a +mix of lengths 39 and 40: -Of course runs can't have a fractional length, so we start the i'th (zero- -based) run at index int(mr * i), for i in range(2**e). The differences between -adjacent starting indices are the run lengths, and it's left as an exercise -for the reader to show that they have the nice properties listed above. See -note MINRUN CODE for an executable Python implementation to help make it all -concrete. +old: 40 40 40 40 40 40 40 35 +new: 39 39 40 39 39 40 39 40 -The code doesn't actually compute the starting indices, or use floats. Instead -mr is represented as a pair of integers such that the infinite precision mr is -equal to mr_int + mr_frac / 2**e, and only the delta (run length) from one -index to the next is computed. +Both schemes produce eight runs, a power of 2. That's good for a balanced +merge tree. But the new scheme allows merges where left and right length +never differ by more than 1: + +39 39 40 39 39 40 39 40 + 78 79 79 79 + 157 158 + 315 + +(This shows merges downward, e.g., two runs of length 39 are merged and +become a run of length 78.) + +With larger lists, the old scheme can get even more unbalanced. For example, +with 32769 elements (that's 2**15 + 1), it uses minrun=33 and produces 993 +runs (of length 33). That's not even a power of 2. The new scheme instead +produces 1024 runs, all with length 32 except for the last one with length 33. + +How does it work? Ideally, all runs would be exactly equally long. For the +above example, each run would have 315/8 = 39.375 elements. Which of course +doesn't work. But we can get close: + +For the first run, we'd like 39.375 elements. 
Since that's impossible, we +instead use 39 (the floor) and remember the current leftover fraction 0.375. +For the second run, we add 0.375 + 39.375 = 39.75. Again impossible, so we +instead use 39 and remember 0.75. For the third run, we add 0.75 + 39.375 = +40.125. This time we get 40 and remember 0.125. And so on. Here's a Python +generator doing that: + +def gen_minruns_with_floats(n): + mr = n + while mr >= MAX_MINRUN: + mr /= 2 + + mr_current = 0 + while True: + mr_current += mr + yield int(mr_current) + mr_current %= 1 + +But while all arithmetic here can be done exactly using binary floating point, +floats have less precision than a Py_ssize_t, and mixing floats with ints is +needlessly expensive anyway. + +So here's an integer version, where the internal numbers are scaled up by +2**e, or rather not divided by 2**e. Instead, only each yielded minrun gets +divided (by right-shifting). For example instead of adding 39.375 and +reducing modulo 1, it just adds 315 and reduces modulo 8. And always divides +by 8 to get each actual minrun value: + +def gen_minruns_simpler(n): + e = 0 + while (n >> e) >= MAX_MINRUN: + e += 1 + mask = (1 << e) - 1 + + mr_current = 0 + while True: + mr_current += n + yield mr_current >> e + mr_current &= mask + +See note MINRUN CODE for a full implementation and a driver that exhaustively +verifies the claims above for all list lengths through 2 million. The Merge Pattern @@ -852,23 +906,18 @@ except ImportError: MAX_MINRUN = 64 def gen_minruns(n): - # mr_int = minrun's integral part - # mr_frac = minrun's fractional part with mr_e bits and - # mask mr_mask - mr_int = n + # In listobject.c, initialization is done in merge_init(), and + # the body of the loop in minrun_next(). 
mr_e = 0 - while mr_int >= MAX_MINRUN: - mr_int >>= 1 + while (n >> mr_e) >= MAX_MINRUN: mr_e += 1 mr_mask = (1 << mr_e) - 1 - mr_frac = n & mr_mask - mr_current_frac = 0 + mr_current = 0 while True: - mr_current_frac += mr_frac - assert mr_current_frac >> mr_e <= 1 - yield mr_int + (mr_current_frac >> mr_e) - mr_current_frac &= mr_mask + mr_current += n + yield mr_current >> mr_e + mr_current &= mr_mask def chew(n, show=False): if n < 1: @@ -884,7 +933,7 @@ def chew(n, show=False): assert tot == n print(n, len(sizes)) - small, large = 32, 64 + small, large = MAX_MINRUN // 2, MAX_MINRUN while len(sizes) > 1: assert not len(sizes) & 1 assert len(sizes).bit_count() == 1 # i.e., power of 2 @@ -913,4 +962,4 @@ def chew(n, show=False): assert sizes[0] == n for n in range(2_000_001): - chew(n) + chew(n) \ No newline at end of file From a07d23c11ca7f37db5f22cbc6e90f170a7559bc9 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Mon, 16 Jun 2025 13:15:17 -0500 Subject: [PATCH 5/5] Typo repair. --- Objects/listobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index d76ee529a43e65..46a71db7b09385 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1747,7 +1747,7 @@ struct s_MergeState { int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *); /* Varisbles used for minrun computation. The "ideal" minrun length is - * the infinite precision listlen / 2**e. See listlen.txt. + * the infinite precision listlen / 2**e. See listsort.txt. */ Py_ssize_t mr_current, mr_e, mr_mask; };