@@ -192,6 +192,62 @@ def _heapify_max(x):
192192 for i in reversed (range (n // 2 )):
193193 _siftup_max (x , i )
194194
195+
196+ # Algorithm notes for nlargest() and nsmallest()
197+ # ==============================================
198+ #
199+ # Makes just one pass over the data while keeping the n most extreme values
200+ # in a heap. Memory consumption is limited to keeping n values in a list.
201+ #
202+ # Number of comparisons for n random inputs, keeping the k smallest values:
203+ # -----------------------------------------------------------
204+ # Step Comparisons Action
205+ # 1 2*k heapify the first k-inputs
206+ # 2 n-k compare new input elements to top of heap
207+ # 3 k*lg2(k)*(ln(n)-ln(k)) add new extreme values to the heap
208+ # 4 k*lg2(k) final sort of the k most extreme values
209+ #
210+ # n-random inputs k-extreme values number of comparisons % more than min()
211+ # --------------- ---------------- ------------------- -----------------
212+ # 10,000 100 13,634 36.3%
213+ # 100,000 100 105,163 5.2%
214+ # 1,000,000 100 1,006,694 0.7%
215+ #
216+ # Computing the number of comparisons for step 3:
217+ # -----------------------------------------------
218+ # * For the i-th new value from the iterable, the probability of being in the
219+ # k most extreme values is k/i. For example, the probability of the 101st
220+ # value seen being in the 100 most extreme values is 100/101.
221+ # * If the value is a new extreme value, the cost of inserting it into the
222+ # heap is log(k, 2).
223+ # * The probability times the cost gives:
224+ # (k/i) * log(k, 2)
225+ # * Summing across the remaining n-k elements gives:
226+ # sum((k/i) * log(k, 2) for i in xrange(k+1, n+1))
227+ # * This reduces to:
228+ # (H(n) - H(k)) * k * log(k, 2)
229+ # * Where H(n) is the n-th harmonic number estimated by:
230+ # H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
231+ # gamma = 0.5772156649
232+ # http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
233+ # * Substituting the H(n) formula and ignoring the 1/(2*n) fraction gives:
234+ # comparisons = k * log(k, 2) * (log(n,e) - log(k, e))
235+ #
236+ # Worst-case for step 3:
237+ # ---------------------
238+ # In the worst case, the input data is reverse sorted so that every new element
239+ # must be inserted in the heap:
240+ # comparisons = log(k, 2) * (n - k)
241+ #
242+ # Alternative Algorithms
243+ # ----------------------
244+ # Other algorithms were not used because they:
245+ # 1) Took much more auxiliary memory,
246+ # 2) Made multiple passes over the data,
247+ # 3) Made more comparisons in common cases (small k, large n, semi-random input).
248+ # See detailed comparisons at:
249+ # http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
250+
195251def nlargest (n , iterable ):
196252 """Find the n largest elements in a dataset.
197253
0 commit comments