From 4ad778b996f0c014ff3f15fa2feb14b547965830 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 6 May 2020 14:58:32 -0700 Subject: [PATCH 01/10] Add weights to random.sample() --- Lib/random.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Lib/random.py b/Lib/random.py index f2c4f39fb6079d..d4739d97680e12 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -331,7 +331,7 @@ def shuffle(self, x, random=None): j = _int(random() * (i+1)) x[i], x[j] = x[j], x[i] - def sample(self, population, k): + def sample(self, population, k, *, weights=None): """Chooses k unique random elements from a population sequence or set. Returns a new list containing elements from the population while @@ -340,6 +340,10 @@ def sample(self, population, k): samples. This allows raffle winners (the sample) to be partitioned into grand prize and second place winners (the subslices). + If weights are given, they must be non-negative integer counts. + Each selection effectively reduces the count by one, lowering + the probablity for the next selection. + Members of the population need not be hashable or unique. If the population contains repeats, then each occurrence is a possible selection in the sample. @@ -379,6 +383,16 @@ def sample(self, population, k): population = tuple(population) if not isinstance(population, _Sequence): raise TypeError("Population must be a sequence. For dicts or sets, use sorted(d).") + if weights is not None: + cum_weights = list(_accumulate(weights)) + total = cum_weights.pop() + if not isinstance(total, int): + raise TypeError('Weights must be integers') + if total < 0: + raise ValueError('Total of weights must be greater than zero') + selections = sample(range(total), k=k) + bisect = _bisect + return [population[bisect(cum_weights, s)] for s in selections] randbelow = self._randbelow n = len(population) if not 0 <= k <= n: From eaed7b3bab73a37f98eb750ef6f1091cd537c7b3 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 6 May 2020 15:21:45 -0700 Subject: [PATCH 02/10] Add docs --- Doc/library/random.rst | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Doc/library/random.rst b/Doc/library/random.rst index f37bc2a111d954..0f91a6e25f830c 100644 --- a/Doc/library/random.rst +++ b/Doc/library/random.rst @@ -217,7 +217,7 @@ Functions for sequences The optional parameter *random*. -.. function:: sample(population, k) +.. function:: sample(population, k, *, weights=None) Return a *k* length list of unique elements chosen from the population sequence or set. Used for random sampling without replacement. @@ -231,6 +231,10 @@ Functions for sequences Members of the population need not be :term:`hashable` or unique. If the population contains repeats, then each occurrence is a possible selection in the sample. + If *weights* are given, they must be non-negative integer counts. + Each selection effectively reduces the count by one, lowering + the probablity for the next selection. + To choose a sample from a range of integers, use a :func:`range` object as an argument. This is especially fast and space efficient for sampling from a large population: ``sample(range(10000000), k=60)``. @@ -238,6 +242,9 @@ Functions for sequences If the sample size is larger than the population size, a :exc:`ValueError` is raised. + .. versionchanged 3.9 + Added the *weights* parameter. + .. deprecated:: 3.9 In the future, the *population* must be a sequence. Instances of :class:`set` are no longer supported. The set must first be converted @@ -420,12 +427,11 @@ Simulations:: >>> choices(['red', 'black', 'green'], [18, 18, 2], k=6) ['red', 'green', 'black', 'black', 'red', 'black'] - >>> # Deal 20 cards without replacement from a deck of 52 playing cards - >>> # and determine the proportion of cards with a ten-value - >>> # (a ten, jack, queen, or king). - >>> deck = collections.Counter(tens=16, low_cards=36) - >>> seen = sample(list(deck.elements()), k=20) - >>> seen.count('tens') / 20 + >>> # Deal 20 cards without replacement from a deck + >>> # of 52 playing cards, and determine the proportion of cards + >>> # with a ten-value: ten, jack, queen, or king. + >>> dealt = sample(['tens', 'low cards'], weights=[16, 36], k=20) + >>> dealt.count('tens') / 20 0.15 >>> # Estimate the probability of getting 5 or more heads from 7 spins From 787ec385813cad625025e8c5a8922a0870994dea Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 6 May 2020 15:36:58 -0700 Subject: [PATCH 03/10] Add blurb --- .../NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst diff --git a/Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst b/Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst new file mode 100644 index 00000000000000..543ef8a2d7df79 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst @@ -0,0 +1 @@ +Added an optional *weights* parameter to random.sample(). From ab6b380d020bf7c5d7b6521711bb425295701de0 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 6 May 2020 16:43:35 -0700 Subject: [PATCH 04/10] Add tests --- Lib/random.py | 4 +++- Lib/test/test_random.py | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/Lib/random.py b/Lib/random.py index d4739d97680e12..44b567601f85e0 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -383,8 +383,11 @@ def sample(self, population, k, *, weights=None): population = tuple(population) if not isinstance(population, _Sequence): raise TypeError("Population must be a sequence. For dicts or sets, use sorted(d).") + n = len(population) if weights is not None: cum_weights = list(_accumulate(weights)) + if len(cum_weights) != n: + raise ValueError('The number of weights does not match the population') total = cum_weights.pop() if not isinstance(total, int): raise TypeError('Weights must be integers') @@ -394,7 +397,6 @@ def sample(self, population, k, *, weights=None): bisect = _bisect return [population[bisect(cum_weights, s)] for s in selections] randbelow = self._randbelow - n = len(population) if not 0 <= k <= n: raise ValueError("Sample larger than population or is negative") result = [None] * k diff --git a/Lib/test/test_random.py b/Lib/test/test_random.py index bb95ca0884a516..ff564657677a77 100644 --- a/Lib/test/test_random.py +++ b/Lib/test/test_random.py @@ -9,7 +9,7 @@ from math import log, exp, pi, fsum, sin, factorial from test import support from fractions import Fraction - +from collections import Counter class TestBasicOps: # Superclass with tests common to all generators. @@ -161,6 +161,43 @@ def test_sample_on_sets(self): population = {10, 20, 30, 40, 50, 60, 70} self.gen.sample(population, k=5) + def test_sample_with_weights(self): + sample = self.gen.sample + + # General case + colors = ['red', 'green', 'blue', 'orange', 'black', 'brown', 'amber'] + weights = [500, 200, 20, 10, 5, 0, 1 ] + k = 700 + summary = Counter(sample(colors, weights=weights, k=k)) + self.assertEqual(sum(summary.values()), k) + for color, weight in zip(colors, weights): + self.assertLessEqual(summary[color], weight) + self.assertNotIn('brown', summary) + + # Case that exhausts the population + k = sum(weights) + summary = Counter(sample(colors, weights=weights, k=k)) + self.assertEqual(sum(summary.values()), k) + for color, weight in zip(colors, weights): + self.assertLessEqual(summary[color], weight) + self.assertNotIn('brown', summary) + + # Case with population size of 1 + summary = Counter(sample(['x'], weights=[10], k=8)) + self.assertEqual(summary, Counter(x=8)) + + # Test error handling + with self.assertRaises(TypeError): + sample(['red', 'green', 'blue'], weights=10, k=10) # weights not iterable + with self.assertRaises(ValueError): + sample(['red', 'green', 'blue'], weights=[-3, -7, -8], k=2) # weights are negative + with self.assertRaises(ValueError): + sample(['red', 'green'], weights=[10, 10], k=21) # population too small + with self.assertRaises(ValueError): + sample(['red', 'green', 'blue'], weights=[1, 2], k=2) # too few weights + with self.assertRaises(ValueError): + sample(['red', 'green', 'blue'], weights=[1, 2, 3, 4], k=2) # too many weights + def test_choices(self): choices = self.gen.choices data = ['red', 'green', 'blue', 'yellow'] From ea5e42ee089a8a0536fc5b95ca48ffab645d1853 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Wed, 6 May 2020 16:45:12 -0700 Subject: [PATCH 05/10] Update Doc/library/random.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rémi Lapeyre --- Doc/library/random.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/random.rst b/Doc/library/random.rst index 0f91a6e25f830c..cf02c66340d3ba 100644 --- a/Doc/library/random.rst +++ b/Doc/library/random.rst @@ -242,7 +242,7 @@ Functions for sequences If the sample size is larger than the population size, a :exc:`ValueError` is raised. - .. versionchanged 3.9 + .. versionchanged:: 3.9 Added the *weights* parameter. .. deprecated:: 3.9 From 672f739050b398b806a798eb598fa2eace391790 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 7 May 2020 01:09:28 -0700 Subject: [PATCH 06/10] Add exact equidistribution test --- Lib/test/test_random.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Lib/test/test_random.py b/Lib/test/test_random.py index ff564657677a77..d48a596554be9d 100644 --- a/Lib/test/test_random.py +++ b/Lib/test/test_random.py @@ -186,6 +186,11 @@ def test_sample_with_weights(self): summary = Counter(sample(['x'], weights=[10], k=8)) self.assertEqual(summary, Counter(x=8)) + # Case with all weights equal. + nc = len(colors) + summary = Counter(sample(colors, weights=[10]*nc, k=10*nc)) + self.assertEqual(summary, Counter(10*colors)) + # Test error handling with self.assertRaises(TypeError): sample(['red', 'green', 'blue'], weights=10, k=10) # weights not iterable From 556582d25bacb12b5a7c013a898679a51d88c625 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 7 May 2020 07:57:58 -0700 Subject: [PATCH 07/10] Earlier test for zero total weight --- Lib/random.py | 2 +- Lib/test/test_random.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/random.py b/Lib/random.py index 44b567601f85e0..3004e766ee822f 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -391,7 +391,7 @@ def sample(self, population, k, *, weights=None): total = cum_weights.pop() if not isinstance(total, int): raise TypeError('Weights must be integers') - if total < 0: + if total <= 0: raise ValueError('Total of weights must be greater than zero') selections = sample(range(total), k=k) bisect = _bisect diff --git a/Lib/test/test_random.py b/Lib/test/test_random.py index d48a596554be9d..c34a950890e1c5 100644 --- a/Lib/test/test_random.py +++ b/Lib/test/test_random.py @@ -196,6 +196,8 @@ def test_sample_with_weights(self): sample(['red', 'green', 'blue'], weights=10, k=10) # weights not iterable with self.assertRaises(ValueError): sample(['red', 'green', 'blue'], weights=[-3, -7, -8], k=2) # weights are negative + with self.assertRaises(ValueError): + sample(['red', 'green', 'blue'], weights=[0, 0, 0], k=2) # weights are zero with self.assertRaises(ValueError): sample(['red', 'green'], weights=[10, 10], k=21) # population too small with self.assertRaises(ValueError): From 4fb554a50aa7c2ddd0e2a724eafec6fa92ed5df3 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 7 May 2020 13:51:29 -0700 Subject: [PATCH 08/10] Change parameter name from "weights" to "counts". Add another test. --- Doc/library/random.rst | 13 ++-- Lib/random.py | 20 +++--- Lib/test/test_random.py | 61 +++++++++++++------ .../2020-05-06-15-36-47.bpo-40541.LlYghL.rst | 2 +- 4 files changed, 62 insertions(+), 34 deletions(-) diff --git a/Doc/library/random.rst b/Doc/library/random.rst index cf02c66340d3ba..90366f499cae6a 100644 --- a/Doc/library/random.rst +++ b/Doc/library/random.rst @@ -217,7 +217,7 @@ Functions for sequences The optional parameter *random*. -.. function:: sample(population, k, *, weights=None) +.. function:: sample(population, k, *, counts=None) Return a *k* length list of unique elements chosen from the population sequence or set. Used for random sampling without replacement. @@ -231,9 +231,10 @@ Functions for sequences Members of the population need not be :term:`hashable` or unique. If the population contains repeats, then each occurrence is a possible selection in the sample. - If *weights* are given, they must be non-negative integer counts. - Each selection effectively reduces the count by one, lowering - the probablity for the next selection. + Repeated elements can be specified one at a time or with the optional + keyword-only *counts* parameter. For example, ``sample(['red', 'blue'], + counts=[4, 2], k=5)`` is equivalent to ``sample(['red', 'red', 'red', 'red', + 'blue', 'blue'], k=5)``. To choose a sample from a range of integers, use a :func:`range` object as an argument. This is especially fast and space efficient for sampling from a large @@ -243,7 +244,7 @@ Functions for sequences is raised. .. versionchanged:: 3.9 - Added the *weights* parameter. + Added the *counts* parameter. .. deprecated:: 3.9 In the future, the *population* must be a sequence. Instances of @@ -430,7 +431,7 @@ Simulations:: >>> # Deal 20 cards without replacement from a deck >>> # of 52 playing cards, and determine the proportion of cards >>> # with a ten-value: ten, jack, queen, or king. - >>> dealt = sample(['tens', 'low cards'], weights=[16, 36], k=20) + >>> dealt = sample(['tens', 'low cards'], counts=[16, 36], k=20) >>> dealt.count('tens') / 20 0.15 diff --git a/Lib/random.py b/Lib/random.py index 3004e766ee822f..4692541090421f 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -331,7 +331,7 @@ def shuffle(self, x, random=None): j = _int(random() * (i+1)) x[i], x[j] = x[j], x[i] - def sample(self, population, k, *, weights=None): + def sample(self, population, k, *, counts=None): """Chooses k unique random elements from a population sequence or set. Returns a new list containing elements from the population while @@ -340,7 +340,7 @@ def sample(self, population, k, *, weights=None): samples. This allows raffle winners (the sample) to be partitioned into grand prize and second place winners (the subslices). - If weights are given, they must be non-negative integer counts. + If counts are given, they must be non-negative integer counts. Each selection effectively reduces the count by one, lowering the probablity for the next selection. @@ -384,18 +384,18 @@ def sample(self, population, k, *, weights=None): if not isinstance(population, _Sequence): raise TypeError("Population must be a sequence. For dicts or sets, use sorted(d).") n = len(population) - if weights is not None: - cum_weights = list(_accumulate(weights)) - if len(cum_weights) != n: - raise ValueError('The number of weights does not match the population') - total = cum_weights.pop() + if counts is not None: + cum_counts = list(_accumulate(counts)) + if len(cum_counts) != n: + raise ValueError('The number of counts does not match the population') + total = cum_counts.pop() if not isinstance(total, int): - raise TypeError('Weights must be integers') + raise TypeError('Counts must be integers') if total <= 0: - raise ValueError('Total of weights must be greater than zero') + raise ValueError('Total of counts must be greater than zero') selections = sample(range(total), k=k) bisect = _bisect - return [population[bisect(cum_weights, s)] for s in selections] + return [population[bisect(cum_counts, s)] for s in selections] randbelow = self._randbelow if not 0 <= k <= n: raise ValueError("Sample larger than population or is negative") diff --git a/Lib/test/test_random.py b/Lib/test/test_random.py index c34a950890e1c5..3daae2fcd70718 100644 --- a/Lib/test/test_random.py +++ b/Lib/test/test_random.py @@ -161,49 +161,76 @@ def test_sample_on_sets(self): population = {10, 20, 30, 40, 50, 60, 70} self.gen.sample(population, k=5) - def test_sample_with_weights(self): + def test_sample_with_counts(self): sample = self.gen.sample # General case - colors = ['red', 'green', 'blue', 'orange', 'black', 'brown', 'amber'] - weights = [500, 200, 20, 10, 5, 0, 1 ] + colors = ['red', 'green', 'blue', 'orange', 'black', 'brown', 'amber'] + counts = [500, 200, 20, 10, 5, 0, 1 ] k = 700 - summary = Counter(sample(colors, weights=weights, k=k)) + summary = Counter(sample(colors, counts=counts, k=k)) self.assertEqual(sum(summary.values()), k) - for color, weight in zip(colors, weights): + for color, weight in zip(colors, counts): self.assertLessEqual(summary[color], weight) self.assertNotIn('brown', summary) # Case that exhausts the population - k = sum(weights) - summary = Counter(sample(colors, weights=weights, k=k)) + k = sum(counts) + summary = Counter(sample(colors, counts=counts, k=k)) self.assertEqual(sum(summary.values()), k) - for color, weight in zip(colors, weights): + for color, weight in zip(colors, counts): self.assertLessEqual(summary[color], weight) self.assertNotIn('brown', summary) # Case with population size of 1 - summary = Counter(sample(['x'], weights=[10], k=8)) + summary = Counter(sample(['x'], counts=[10], k=8)) self.assertEqual(summary, Counter(x=8)) - # Case with all weights equal. + # Case with all counts equal. nc = len(colors) - summary = Counter(sample(colors, weights=[10]*nc, k=10*nc)) + summary = Counter(sample(colors, counts=[10]*nc, k=10*nc)) self.assertEqual(summary, Counter(10*colors)) # Test error handling with self.assertRaises(TypeError): - sample(['red', 'green', 'blue'], weights=10, k=10) # weights not iterable + sample(['red', 'green', 'blue'], counts=10, k=10) # counts not iterable with self.assertRaises(ValueError): - sample(['red', 'green', 'blue'], weights=[-3, -7, -8], k=2) # weights are negative + sample(['red', 'green', 'blue'], counts=[-3, -7, -8], k=2) # counts are negative with self.assertRaises(ValueError): - sample(['red', 'green', 'blue'], weights=[0, 0, 0], k=2) # weights are zero + sample(['red', 'green', 'blue'], counts=[0, 0, 0], k=2) # counts are zero with self.assertRaises(ValueError): - sample(['red', 'green'], weights=[10, 10], k=21) # population too small + sample(['red', 'green'], counts=[10, 10], k=21) # population too small with self.assertRaises(ValueError): - sample(['red', 'green', 'blue'], weights=[1, 2], k=2) # too few weights + sample(['red', 'green', 'blue'], counts=[1, 2], k=2) # too few counts with self.assertRaises(ValueError): - sample(['red', 'green', 'blue'], weights=[1, 2, 3, 4], k=2) # too many weights + sample(['red', 'green', 'blue'], counts=[1, 2, 3, 4], k=2) # too many counts + + def test_sample_counts_equivalence(self): + # Test the documented strong equivalence to a sample with repeated elements. + # We run this test on random.Random() which makes deteriministic selections + # for a given seed value. + sample = random.sample + seed = random.seed + + colors = ['red', 'green', 'blue', 'orange', 'black', 'amber'] + counts = [500, 200, 20, 10, 5, 1 ] + k = 700 + seed(8675309) + s1 = sample(colors, counts=counts, k=k) + seed(8675309) + expanded = [color for (color, count) in zip(colors, counts) for i in range(count)] + self.assertEqual(len(expanded), sum(counts)) + s2 = sample(expanded, k=k) + self.assertEqual(s1, s2) + + pop = 'abcdefghi' + counts = [10, 9, 8, 7, 6, 5, 4, 3, 2] + seed(8675309) + s1 = ''.join(sample(pop, counts=counts, k=30)) + expanded = ''.join([letter for (letter, count) in zip(pop, counts) for i in range(count)]) + seed(8675309) + s2 = ''.join(sample(expanded, k=30)) + self.assertEqual(s1, s2) def test_choices(self): choices = self.gen.choices diff --git a/Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst b/Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst index 543ef8a2d7df79..a2e694ac1ad080 100644 --- a/Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst +++ b/Misc/NEWS.d/next/Library/2020-05-06-15-36-47.bpo-40541.LlYghL.rst @@ -1 +1 @@ -Added an optional *weights* parameter to random.sample(). +Added an optional *counts* parameter to random.sample(). From 55e948e61b96a280e913617d3d2f5f87df442795 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 7 May 2020 14:07:39 -0700 Subject: [PATCH 09/10] Sync-up docstring with the main docs --- Lib/random.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Lib/random.py b/Lib/random.py index 4692541090421f..a758eb4f743aed 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -340,14 +340,19 @@ def sample(self, population, k, *, counts=None): samples. This allows raffle winners (the sample) to be partitioned into grand prize and second place winners (the subslices). - If counts are given, they must be non-negative integer counts. - Each selection effectively reduces the count by one, lowering - the probablity for the next selection. - Members of the population need not be hashable or unique. If the population contains repeats, then each occurrence is a possible selection in the sample. + Repeated elements can be specified one at a time or with the optional + counts parameter. For example: + + sample(['red', 'blue'], counts=[4, 2], k=5) + + is equivalent to: + + sample(['red', 'red', 'red', 'red', 'blue', 'blue'], k=5) + To choose a sample in a range of integers, use range as an argument. This is especially fast and space efficient for sampling from a large population: sample(range(10000000), 60) From 1f36ad3c5f2a793eb4c67ac8e5685ce66f764413 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 7 May 2020 19:37:52 -0700 Subject: [PATCH 10/10] Fix typo and improve formatting --- Lib/random.py | 9 ++++++--- Lib/test/test_random.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Lib/random.py b/Lib/random.py index a758eb4f743aed..75f70d5d699ed9 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -353,9 +353,12 @@ def sample(self, population, k, *, counts=None): sample(['red', 'red', 'red', 'red', 'blue', 'blue'], k=5) - To choose a sample in a range of integers, use range as an argument. - This is especially fast and space efficient for sampling from a - large population: sample(range(10000000), 60) + To choose a sample from a range of integers, use range() for the + population argument. This is especially fast and space efficient + for sampling from a large population: + + sample(range(10000000), 60) + """ # Sampling without replacement entails tracking either potential diff --git a/Lib/test/test_random.py b/Lib/test/test_random.py index 3daae2fcd70718..a3710f4aa48a68 100644 --- a/Lib/test/test_random.py +++ b/Lib/test/test_random.py @@ -207,7 +207,7 @@ def test_sample_with_counts(self): def test_sample_counts_equivalence(self): # Test the documented strong equivalence to a sample with repeated elements. - # We run this test on random.Random() which makes deteriministic selections + # We run this test on random.Random() which makes deterministic selections # for a given seed value. sample = random.sample seed = random.seed