From bda0c10f063c6d60e737072b2a7f0187cf7669ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 18 Sep 2018 11:24:11 +0200 Subject: [PATCH 1/5] move core_distances_computation in a helper --- sklearn/cluster/optics_.py | 43 +++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 19b6a79f45994..785e45578df7e 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -14,6 +14,7 @@ import numpy as np from ..utils import check_array +from ..utils import gen_batches, get_chunk_n_rows from ..utils.validation import check_is_fitted from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin @@ -395,8 +396,6 @@ def fit(self, X, y=None): # Start all points as 'unprocessed' ## self.reachability_ = np.empty(n_samples) self.reachability_.fill(np.inf) - self.core_distances_ = np.empty(n_samples) - self.core_distances_.fill(np.nan) # Start all points as noise ## self.labels_ = np.full(n_samples, -1, dtype=int) @@ -407,9 +406,7 @@ def fit(self, X, y=None): n_jobs=self.n_jobs) nbrs.fit(X) - self.core_distances_[:] = nbrs.kneighbors(X, - self.min_samples)[0][:, -1] - + self.core_distances_ = self._calculate_core_distances_(X, nbrs) self.ordering_ = self._calculate_optics_order(X, nbrs) indices_, self.labels_ = _extract_optics(self.ordering_, @@ -425,6 +422,42 @@ def fit(self, X, y=None): # OPTICS helper functions + def _calculate_core_distances_(self, X, nbrs, working_memory=None): + """Compute the k-th nearest neighbor of each sample + + Equivalent to nbrs.kneighbors(X, self.min_samples)[0][:, -1] + but with more memory efficiency. + + Parameters + ---------- + X : array, shape (n_samples, n_features) + The data. + nbrs : NearestNeighbors instance + The fitted nearest neeighbors estimator + working_memory : int, optional + The sought maximum memory for temporary distance matrix chunks. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + Returns + ------- + core_distances : array, shape (n_samples,) + Distance at which each sample becomes a core point. + Points which will never be core have a distance of inf. + """ + n_samples = len(X) + core_distances = np.empty(n_samples) + core_distances.fill(np.nan) + + chunk_n_rows = get_chunk_n_rows(row_bytes=8 * self.min_samples, + max_n_rows=n_samples, + working_memory=working_memory) + slices = gen_batches(n_samples, chunk_n_rows) + for sl in slices: + core_distances[sl] = nbrs.kneighbors( + X[sl], self.min_samples)[0][:, -1] + return core_distances + def _calculate_optics_order(self, X, nbrs): # Main OPTICS loop. Not parallelizable. The order that entries are # written to the 'ordering_' list is important! From fdb527284861b85d447545da2fadeac5571af2e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 18 Sep 2018 11:43:05 +0200 Subject: [PATCH 2/5] typo --- sklearn/cluster/optics_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 785e45578df7e..840f0f7959be2 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -433,7 +433,7 @@ def _calculate_core_distances_(self, X, nbrs, working_memory=None): X : array, shape (n_samples, n_features) The data. nbrs : NearestNeighbors instance - The fitted nearest neeighbors estimator + The fitted nearest neighbors estimator. working_memory : int, optional The sought maximum memory for temporary distance matrix chunks. When None (default), the value of From 7cb65a4ed2cc0dfd53d9dedda39c15cc8311f72c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 18 Sep 2018 14:59:54 +0200 Subject: [PATCH 3/5] renaming --- sklearn/cluster/optics_.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 840f0f7959be2..e78935587e90f 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -406,7 +406,7 @@ def fit(self, X, y=None): n_jobs=self.n_jobs) nbrs.fit(X) - self.core_distances_ = self._calculate_core_distances_(X, nbrs) + self.core_distances_ = self._compute_core_distances_(X, nbrs) self.ordering_ = self._calculate_optics_order(X, nbrs) indices_, self.labels_ = _extract_optics(self.ordering_, @@ -422,7 +422,7 @@ def fit(self, X, y=None): # OPTICS helper functions - def _calculate_core_distances_(self, X, nbrs, working_memory=None): + def _compute_core_distances_(self, X, neighbors, working_memory=None): """Compute the k-th nearest neighbor of each sample Equivalent to nbrs.kneighbors(X, self.min_samples)[0][:, -1] @@ -432,7 +432,7 @@ def _calculate_core_distances_(self, X, nbrs, working_memory=None): ---------- X : array, shape (n_samples, n_features) The data. - nbrs : NearestNeighbors instance + neighbors : NearestNeighbors instance The fitted nearest neighbors estimator. working_memory : int, optional The sought maximum memory for temporary distance matrix chunks. @@ -454,7 +454,7 @@ def _calculate_core_distances_(self, X, nbrs, working_memory=None): working_memory=working_memory) slices = gen_batches(n_samples, chunk_n_rows) for sl in slices: - core_distances[sl] = nbrs.kneighbors( + core_distances[sl] = neighbors.kneighbors( X[sl], self.min_samples)[0][:, -1] return core_distances From 13a42bdbe41de428707801564d699882230a7c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 18 Sep 2018 15:01:21 +0200 Subject: [PATCH 4/5] FIX memory consumption by sample --- sklearn/cluster/optics_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index e78935587e90f..7f5d3cf3f5e81 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -449,7 +449,7 @@ def _compute_core_distances_(self, X, neighbors, working_memory=None): core_distances = np.empty(n_samples) core_distances.fill(np.nan) - chunk_n_rows = get_chunk_n_rows(row_bytes=8 * self.min_samples, + chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self.min_samples, max_n_rows=n_samples, working_memory=working_memory) slices = gen_batches(n_samples, chunk_n_rows) From 40ecd64394558fe3cf7971982d4cf7e773e34c1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 18 Sep 2018 19:26:23 +0200 Subject: [PATCH 5/5] DOC minor --- sklearn/cluster/optics_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 7f5d3cf3f5e81..9b10912cca970 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -425,7 +425,7 @@ def fit(self, X, y=None): def _compute_core_distances_(self, X, neighbors, working_memory=None): """Compute the k-th nearest neighbor of each sample - Equivalent to nbrs.kneighbors(X, self.min_samples)[0][:, -1] + Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1] but with more memory efficiency. Parameters