From d33c1e2682b1964d03b3fc749805fe341c95ca9f Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sun, 12 Mar 2023 12:13:32 -0400 Subject: [PATCH 1/7] Updated TreeUnionFind --- sklearn/cluster/_hdbscan/_tree.pyx | 46 ++++++++++++++---------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 0e493f28379eb..bb2f7922a792b 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -284,38 +284,36 @@ cdef max_lambdas(cnp.ndarray hierarchy): cdef class TreeUnionFind (object): - cdef cnp.ndarray _data_arr - cdef cnp.intp_t[:, ::1] _data - cdef cnp.ndarray is_component + cdef cnp.intp_t[:, ::1] data + cdef cnp.uint8_t[::1] is_component def __init__(self, size): - self._data_arr = np.zeros((size, 2), dtype=np.intp) - self._data_arr.T[0] = np.arange(size) - self._data = self._data_arr + cdef cnp.ndarray[cnp.intp_t, ndim=2] data_arr + data_arr = np.zeros((size, 2), dtype=np.intp) + data_arr.T[0] = np.arange(size) + self.data = data_arr self.is_component = np.ones(size, dtype=bool) - cdef union_(self, cnp.intp_t x, cnp.intp_t y): + @cython.final + cdef void union(self, cnp.intp_t x, cnp.intp_t y): cdef cnp.intp_t x_root = self.find(x) cdef cnp.intp_t y_root = self.find(y) - if self._data[x_root, 1] < self._data[y_root, 1]: - self._data[x_root, 0] = y_root - elif self._data[x_root, 1] > self._data[y_root, 1]: - self._data[y_root, 0] = x_root + if self.data[x_root, 1] < self.data[y_root, 1]: + self.data[x_root, 0] = y_root + elif self.data[x_root, 1] > self.data[y_root, 1]: + self.data[y_root, 0] = x_root else: - self._data[y_root, 0] = x_root - self._data[x_root, 1] += 1 - + self.data[y_root, 0] = x_root + self.data[x_root, 1] += 1 return - cdef find(self, cnp.intp_t x): - if self._data[x, 0] != x: - self._data[x, 0] = self.find(self._data[x, 0]) + @cython.final + cdef cnp.intp_t find(self, cnp.intp_t x): + if self.data[x, 0] != x: + self.data[x, 0] = self.find(self.data[x, 0]) self.is_component[x] = False - return self._data[x, 0] - - cdef cnp.ndarray[cnp.intp_t, ndim=1] components(self): - return self.is_component.nonzero()[0] + return self.data[x, 0] cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( @@ -361,8 +359,8 @@ cpdef cnp.ndarray[cnp.intp_t, ndim=1] labelling_at_cut( cluster = n_samples for row in linkage: if row[2] < cut: - union_find.union_( row[0], cluster) - union_find.union_( row[1], cluster) + union_find.union( row[0], cluster) + union_find.union( row[1], cluster) cluster += 1 cluster_size = np.zeros(cluster, dtype=np.intp) @@ -416,7 +414,7 @@ cdef cnp.ndarray[cnp.intp_t, ndim=1] do_labelling( child = child_array[n] parent = parent_array[n] if child not in clusters: - union_find.union_(parent, child) + union_find.union(parent, child) for n in range(root_cluster): cluster = union_find.find(n) From 90974a2e549a98199905943c1de0cfe5a38ec3f9 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Sun, 12 Mar 2023 12:22:35 -0400 Subject: [PATCH 2/7] Improved syntax --- sklearn/cluster/_hdbscan/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index bb2f7922a792b..7bd5ef0d25646 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -282,7 +282,7 @@ cdef max_lambdas(cnp.ndarray hierarchy): return deaths -cdef class TreeUnionFind (object): +cdef class TreeUnionFind: cdef cnp.intp_t[:, ::1] data cdef cnp.uint8_t[::1] is_component From cf88b9807acc215ea287f016a0ea653a74d931cf Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Mon, 13 Mar 2023 08:54:28 -0400 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/cluster/_hdbscan/_tree.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 7bd5ef0d25646..9d77f48023127 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -282,6 +282,7 @@ cdef max_lambdas(cnp.ndarray hierarchy): return deaths +@final cdef class TreeUnionFind: cdef cnp.intp_t[:, ::1] data @@ -292,7 +293,7 @@ cdef class TreeUnionFind: data_arr = np.zeros((size, 2), dtype=np.intp) data_arr.T[0] = np.arange(size) self.data = data_arr - self.is_component = np.ones(size, dtype=bool) + self.is_component = np.ones(size, dtype=np.uint8) @cython.final cdef void union(self, cnp.intp_t x, cnp.intp_t y): From 650b9579b23571d07be8a65245f1331389bcfc49 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Mar 2023 11:41:08 -0400 Subject: [PATCH 4/7] Implemented reviewer feedback --- sklearn/cluster/_hdbscan/_tree.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 7bd5ef0d25646..3da089369733f 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -288,10 +288,10 @@ cdef class TreeUnionFind: cdef cnp.uint8_t[::1] is_component def __init__(self, size): - cdef cnp.ndarray[cnp.intp_t, ndim=2] data_arr - data_arr = np.zeros((size, 2), dtype=np.intp) - data_arr.T[0] = np.arange(size) - self.data = data_arr + cdef cnp.intp_t idx + self.data = np.zeros((size, 2), dtype=np.intp) + for idx in range(size): + self.data[idx, 0] = idx self.is_component = np.ones(size, dtype=bool) @cython.final From d90a5a3baa5b74bdb170a5c994ca961031386263 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Mar 2023 11:42:37 -0400 Subject: [PATCH 5/7] Fixed merge error --- sklearn/cluster/_hdbscan/_tree.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 3f2751029a8c4..34121835b740e 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -293,9 +293,8 @@ cdef class TreeUnionFind: self.data = np.zeros((size, 2), dtype=np.intp) for idx in range(size): self.data[idx, 0] = idx - self.is_component = np.ones(size, dtype=bool) + self.is_component = np.ones(size, dtype=np.uint8_t) - @cython.final cdef void union(self, cnp.intp_t x, cnp.intp_t y): cdef cnp.intp_t x_root = self.find(x) cdef cnp.intp_t y_root = self.find(y) @@ -309,7 +308,6 @@ cdef class TreeUnionFind: self.data[x_root, 1] += 1 return - @cython.final cdef cnp.intp_t find(self, cnp.intp_t x): if self.data[x, 0] != x: self.data[x, 0] = self.find(self.data[x, 0]) From 2c20d3602d3b9742ac7813b4a77b1c634fed9856 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Mar 2023 11:59:36 -0400 Subject: [PATCH 6/7] Fixed final decorator --- sklearn/cluster/_hdbscan/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index 34121835b740e..ed7be94825e06 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -282,7 +282,7 @@ cdef max_lambdas(cnp.ndarray hierarchy): return deaths -@final +@cython.final cdef class TreeUnionFind: cdef cnp.intp_t[:, ::1] data From d12e6ce0b0bdba26099a3a53bb90a915f901708b Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Tue, 14 Mar 2023 12:00:26 -0400 Subject: [PATCH 7/7] Typo --- sklearn/cluster/_hdbscan/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_hdbscan/_tree.pyx b/sklearn/cluster/_hdbscan/_tree.pyx index ed7be94825e06..6e4df6cf12592 100644 --- a/sklearn/cluster/_hdbscan/_tree.pyx +++ b/sklearn/cluster/_hdbscan/_tree.pyx @@ -293,7 +293,7 @@ cdef class TreeUnionFind: self.data = np.zeros((size, 2), dtype=np.intp) for idx in range(size): self.data[idx, 0] = idx - self.is_component = np.ones(size, dtype=np.uint8_t) + self.is_component = np.ones(size, dtype=np.uint8) cdef void union(self, cnp.intp_t x, cnp.intp_t y): cdef cnp.intp_t x_root = self.find(x)