@@ -377,39 +377,59 @@ def shuffle(self, x, random=None, int=int):
def sample(self, population, k, random=None, int=int):
    """Chooses k unique random elements from a population sequence.

    Returns a new list containing elements from the population while
    leaving the original population unchanged.  The resulting list is
    in selection order so that all sub-slices will also be valid random
    samples.  This allows raffle winners (the sample) to be partitioned
    into grand prize and second place winners (the subslices).

    Members of the population need not be hashable or unique.  If the
    population contains repeats, then each occurrence is a possible
    selection in the sample.

    To choose a sample in a range of integers, use xrange as an argument.
    This is especially fast and space efficient for sampling from a
    large population:   sample(xrange(10000000), 60)

    Optional arg random is a 0-argument function returning a random
    float in [0.0, 1.0); by default, the standard random.random.

    Arguments:
        population -- sequence supporting len() and integer indexing.
        k -- number of elements to choose; must satisfy 0 <= k <= len.
        random -- optional 0-argument float source (see above).
        int -- the int builtin, bound as a default so that it is a
               local name inside the selection loops.

    Returns a list of length k.  Raises ValueError when k is negative
    or larger than the population.
    """

    # Sampling without replacement entails tracking either potential
    # selections (the pool) or previous selections.

    # Pools are stored in lists which provide __getitem__ for selection
    # and provide a way to remove selections.  But each list.remove()
    # rebuilds the entire list, so it is better to rearrange the list,
    # placing non-selected elements at the head of the list.  Tracking
    # the selection pool is only space efficient with small populations.

    # Previous selections are stored in dictionaries which provide
    # __contains__ for detecting repeat selections.  Discarding repeats
    # is efficient unless most of the population has already been chosen.
    # So, tracking selections is useful when sample sizes are much
    # smaller than the total population.

    n = len(population)
    if not 0 <= k <= n:
        # The check also rejects negative k, so the message says so.
        raise ValueError("sample larger than population or is negative")
    if random is None:
        random = self.random
    result = [None] * k
    if n < 6 * k:     # if n len list takes less space than a k len dict
        pool = list(population)             # track potential selections
        for i in xrange(k):
            j = int(random() * (n - i))     # non-selected at [0,n-i)
            result[i] = pool[j]             # save selected element
            pool[j] = pool[n - i - 1]       # non-selected to head of list
    else:
        # Keys are indices into population, not the elements themselves,
        # which is why the elements do not have to be hashable.
        selected = {}                       # track previous selections
        for i in xrange(k):
            j = int(random() * n)
            while j in selected:            # discard and replace repeats
                j = int(random() * n)
            result[i] = selected[j] = population[j]
    return result       # return selections in the order they were picked
413433
414434## -------------------- real-valued distributions -------------------
415435
@@ -756,6 +776,7 @@ def _test_sample(n):
756776 for k in xrange (n + 1 ):
757777 s = sample (population , k )
758778 assert len (dict ([(elem ,True ) for elem in s ])) == len (s ) == k
779+ assert None not in s
759780
760781def _sample_generator (n , k ):
761782 # Return a fixed element from the sample. Validates random ordering.
@@ -787,7 +808,7 @@ def _test(N=2000):
787808 _test_generator (N , 'weibullvariate(1.0, 1.0)' )
788809 _test_generator (N , '_sample_generator(50, 5)' ) # expected s.d.: 14.4
789810 _test_generator (N , '_sample_generator(50, 45)' ) # expected s.d.: 14.4
790- _test_sample (1000 )
811+ _test_sample (500 )
791812
792813 # Test jumpahead.
793814 s = getstate ()
0 commit comments