From cc31a2a39faf1e63a0f34d633a0c6d74dcd1daff Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Sat, 5 Nov 2022 15:52:50 -0400
Subject: [PATCH 01/11] Initial cleanup

---
 sklearn/cluster/_hdbscan/_linkage.pyx | 125 ++++++++++++--------------
 1 file changed, 56 insertions(+), 69 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index 0d40191f2c94e..068d6c069482e 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -3,8 +3,9 @@
 #          Steve Astels <sastels@gmail.com>
 # License: 3-clause BSD
 
-import numpy as np
 cimport numpy as cnp
+
+import numpy as np
 import cython
 
 from libc.float cimport DBL_MAX
@@ -14,106 +15,92 @@ from ...cluster._hierarchical_fast cimport UnionFind
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 from ...utils._typedefs import ITYPE, DTYPE
 
-cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_distance_matrix(
-    cnp.ndarray[cnp.double_t, ndim=2] distance_matrix
+cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_distance_matrix(
+    cnp.ndarray[cnp.float64_t, ndim=2] distance_matrix
 ):
 
     cdef:
         cnp.ndarray[cnp.intp_t, ndim=1] node_labels
         cnp.ndarray[cnp.intp_t, ndim=1] current_labels
-        cnp.ndarray[cnp.double_t, ndim=1] current_distances
-        cnp.ndarray[cnp.double_t, ndim=1] left
-        cnp.ndarray[cnp.double_t, ndim=1] right
-        cnp.ndarray[cnp.double_t, ndim=2] result
+        cnp.ndarray[cnp.float64_t, ndim=1] current_distances, left, right
+        cnp.ndarray[cnp.float64_t, ndim=2] result
 
         cnp.ndarray label_filter
 
-        cnp.intp_t current_node
-        cnp.intp_t new_node_index
-        cnp.intp_t new_node
-        cnp.intp_t i
+        cnp.intp_t n_samples = distance_matrix.shape[0]
+        cnp.intp_t current_node, new_node_index, new_node, i
 
-    result = np.zeros((distance_matrix.shape[0] - 1, 3))
-    node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp)
+    result = np.zeros((n_samples - 1, 3))
+    node_labels = np.arange(n_samples, dtype=np.intp)
     current_node = 0
-    current_distances = np.infty * np.ones(distance_matrix.shape[0])
+    current_distances = np.infty * np.ones(n_samples)
     current_labels = node_labels
-    for i in range(1, node_labels.shape[0]):
+    for i in range(1, n_samples):
         label_filter = current_labels != current_node
         current_labels = current_labels[label_filter]
         left = current_distances[label_filter]
         right = distance_matrix[current_node][current_labels]
-        current_distances = np.where(left < right, left, right)
+        current_distances = np.minimum(left, right)
 
         new_node_index = np.argmin(current_distances)
         new_node = current_labels[new_node_index]
-        result[i - 1, 0] = <double> current_node
-        result[i - 1, 1] = <double> new_node
+        result[i - 1, 0] = <cnp.float64_t> current_node
+        result[i - 1, 1] = <cnp.float64_t> new_node
         result[i - 1, 2] = current_distances[new_node_index]
         current_node = new_node
 
     return result
 
 
-cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix(
-    cnp.ndarray[cnp.double_t, ndim=2, mode='c'] raw_data,
-    cnp.ndarray[cnp.double_t, ndim=1, mode='c'] core_distances,
+cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix(
+    cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] raw_data,
+    cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] core_distances,
     DistanceMetric dist_metric,
-    cnp.double_t alpha=1.0
+    cnp.float64_t alpha=1.0
 ):
 
     cdef:
-        cnp.ndarray[cnp.double_t, ndim=1] current_distances_arr
-        cnp.ndarray[cnp.double_t, ndim=1] current_sources_arr
+        cnp.ndarray[cnp.float64_t, ndim=1] current_distances_arr
+        cnp.ndarray[cnp.float64_t, ndim=1] current_sources_arr
         cnp.ndarray[cnp.int8_t, ndim=1] in_tree_arr
-        cnp.ndarray[cnp.double_t, ndim=2] result_arr
+        cnp.ndarray[cnp.float64_t, ndim=2] result_arr
 
-        cnp.double_t * current_distances
-        cnp.double_t * current_sources
-        cnp.double_t * current_core_distances
-        cnp.double_t * raw_data_ptr
+        cnp.float64_t * current_distances
+        cnp.float64_t * current_sources
+        cnp.float64_t * current_core_distances
+        cnp.float64_t * raw_data_ptr
         cnp.int8_t * in_tree
-        cnp.double_t[:, ::1] raw_data_view
-        cnp.double_t[:, ::1] result
+        cnp.float64_t[:, ::1] raw_data_view
+        cnp.float64_t[:, ::1] result
 
         cnp.ndarray label_filter
 
-        cnp.intp_t current_node
-        cnp.intp_t source_node
-        cnp.intp_t right_node
-        cnp.intp_t left_node
-        cnp.intp_t new_node
-        cnp.intp_t i
-        cnp.intp_t j
-        cnp.intp_t dim
-        cnp.intp_t num_features
-
-        double current_node_core_distance
-        double right_value
-        double left_value
-        double core_value
-        double new_distance
-
-    dim = raw_data.shape[0]
+        cnp.intp_t current_node, source_node, right_node, left_node, new_node
+        cnp.intp_t i, j, n_samples, num_features
+
+        cnp.float64_t current_node_core_distance, new_distance
+        cnp.float64_t right_value, left_value, core_value
+
+    n_samples = raw_data.shape[0]
     num_features = raw_data.shape[1]
 
-    raw_data_view = (<cnp.double_t[:raw_data.shape[0], :raw_data.shape[1]:1]> (
-        <cnp.double_t *> raw_data.data))
-    raw_data_ptr = (<cnp.double_t *> &raw_data_view[0, 0])
+    raw_data_view = (<cnp.float64_t[:n_samples, :num_features:1]> (
+        <cnp.float64_t *> raw_data.data))
+    raw_data_ptr = (<cnp.float64_t *> &raw_data_view[0, 0])
 
-    result_arr = np.zeros((dim - 1, 3))
-    in_tree_arr = np.zeros(dim, dtype=np.int8)
+    result_arr = np.zeros((n_samples - 1, 3))
+    in_tree_arr = np.zeros(n_samples, dtype=np.int8)
     current_node = 0
-    current_distances_arr = np.infty * np.ones(dim)
-    current_sources_arr = np.ones(dim)
+    current_distances_arr = np.infty * np.ones(n_samples)
+    current_sources_arr = np.ones(n_samples)
 
-    result = (<cnp.double_t[:dim - 1, :3:1]> (<cnp.double_t *> result_arr.data))
+    result = (<cnp.float64_t[:n_samples - 1, :3:1]> (<cnp.float64_t *> result_arr.data))
     in_tree = (<cnp.int8_t *> in_tree_arr.data)
-    current_distances = (<cnp.double_t *> current_distances_arr.data)
-    current_sources = (<cnp.double_t *> current_sources_arr.data)
-    current_core_distances = (<cnp.double_t *> core_distances.data)
+    current_distances = (<cnp.float64_t *> current_distances_arr.data)
+    current_sources = (<cnp.float64_t *> current_sources_arr.data)
+    current_core_distances = (<cnp.float64_t *> core_distances.data)
 
-    for i in range(1, dim):
+    for i in range(1, n_samples):
 
         in_tree[current_node] = 1
 
@@ -123,7 +110,7 @@ cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix(
         source_node = 0
         new_node = 0
 
-        for j in range(dim):
+        for j in range(n_samples):
             if in_tree[j]:
                 continue
 
@@ -169,26 +156,26 @@ cpdef cnp.ndarray[cnp.double_t, ndim=2] mst_from_data_matrix(
                     source_node = right_source
                     new_node = j
 
-        result[i - 1, 0] = <double> source_node
-        result[i - 1, 1] = <double> new_node
+        result[i - 1, 0] = <cnp.float64_t> source_node
+        result[i - 1, 1] = <cnp.float64_t> new_node
         result[i - 1, 2] = new_distance
         current_node = new_node
 
     return result_arr
 
 @cython.wraparound(True)
-cpdef cnp.ndarray[cnp.double_t, ndim=2] label(cnp.double_t[:,:] L):
+cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(cnp.float64_t[:,:] L):
 
     cdef:
-        cnp.ndarray[cnp.double_t, ndim=2] result_arr
-        cnp.double_t[:, ::1] result
+        cnp.ndarray[cnp.float64_t, ndim=2] result_arr
+        cnp.float64_t[:, ::1] result
 
         cnp.intp_t N, a, aa, b, bb, index
-        cnp.double_t delta
+        cnp.float64_t delta
 
     result_arr = np.zeros((L.shape[0], L.shape[1] + 1))
-    result = (<cnp.double_t[:L.shape[0], :4:1]> (
-        <cnp.double_t *> result_arr.data))
+    result = (<cnp.float64_t[:L.shape[0], :4:1]> (
+        <cnp.float64_t *> result_arr.data))
     N = L.shape[0] + 1
     U = UnionFind(N)
 

From 4dcfe8e97216ecaf61986c43f78a2c712924bb8e Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Mon, 7 Nov 2022 19:47:55 -0500
Subject: [PATCH 02/11] WIP partial implementation of custom struct for MST

---
 sklearn/cluster/_hdbscan/_linkage.pxd | 14 ++++++++++++++
 sklearn/cluster/_hdbscan/_linkage.pyx | 13 ++++++-------
 sklearn/cluster/_hdbscan/hdbscan.py   | 13 +++++++++----
 3 files changed, 29 insertions(+), 11 deletions(-)
 create mode 100644 sklearn/cluster/_hdbscan/_linkage.pxd

diff --git a/sklearn/cluster/_hdbscan/_linkage.pxd b/sklearn/cluster/_hdbscan/_linkage.pxd
new file mode 100644
index 0000000000000..a67afdfdaab69
--- /dev/null
+++ b/sklearn/cluster/_hdbscan/_linkage.pxd
@@ -0,0 +1,14 @@
+cimport numpy as cnp
+import numpy as np
+
+# Numpy structured dtype representing a single ordered edge in Prim's algorithm
+MST_edge_dtype = np.dtype([
+    ("current_node", np.intp),
+    ("next_node", np.intp),
+    ("distance", np.float64),
+])
+
+ctypedef struct MST_edge_t:
+    cnp.intp_t current_node
+    cnp.intp_t next_node
+    cnp.float64_t distance
diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index 068d6c069482e..bd45175d06272 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -15,22 +15,21 @@ from ...cluster._hierarchical_fast cimport UnionFind
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 from ...utils._typedefs import ITYPE, DTYPE
 
-cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_distance_matrix(
+cpdef cnp.ndarray[MST_edge_t, ndim=2] mst_from_distance_matrix(
     cnp.ndarray[cnp.float64_t, ndim=2] distance_matrix
 ):
-
     cdef:
         cnp.ndarray[cnp.intp_t, ndim=1] node_labels
         cnp.ndarray[cnp.intp_t, ndim=1] current_labels
         cnp.ndarray[cnp.float64_t, ndim=1] current_distances, left, right
-        cnp.ndarray[cnp.float64_t, ndim=2] result
+        cnp.ndarray[MST_edge_t, ndim=1] result
 
         cnp.ndarray label_filter
 
         cnp.intp_t n_samples = distance_matrix.shape[0]
         cnp.intp_t current_node, new_node_index, new_node, i
 
-    result = np.zeros((n_samples - 1, 3))
+    result = np.empty(n_samples - 1, dtype=MST_edge_dtype)
     node_labels = np.arange(n_samples, dtype=np.intp)
     current_node = 0
     current_distances = np.infty * np.ones(n_samples)
@@ -44,9 +43,9 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_distance_matrix(
 
         new_node_index = np.argmin(current_distances)
         new_node = current_labels[new_node_index]
-        result[i - 1, 0] = <cnp.float64_t> current_node
-        result[i - 1, 1] = <cnp.float64_t> new_node
-        result[i - 1, 2] = current_distances[new_node_index]
+        result[i - 1].current_node = current_node
+        result[i - 1].next_node = new_node
+        result[i - 1].distance = current_distances[new_node_index]
         current_node = new_node
 
     return result
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 79beead943898..d90a3c4279ee9 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -21,7 +21,12 @@
 from ...neighbors import BallTree, KDTree, NearestNeighbors
 from ...utils._param_validation import Interval, StrOptions
 from ...utils.validation import _assert_all_finite
-from ._linkage import label, mst_from_distance_matrix, mst_from_data_matrix
+from ._linkage import (
+    label,
+    mst_from_distance_matrix,
+    mst_from_data_matrix,
+    MST_edge_dtype,
+)
 from ._reachability import mutual_reachability
 from ._tree import compute_stability, condense_tree, get_clusters, labelling_at_cut
 
@@ -100,8 +105,8 @@ def _tree_to_labels(
 
 def _process_mst(min_spanning_tree):
     # Sort edges of the min_spanning_tree by weight
-    row_order = np.argsort(min_spanning_tree.T[2])
-    min_spanning_tree = min_spanning_tree[row_order, :]
+    row_order = np.argsort(min_spanning_tree["distance"])
+    min_spanning_tree = min_spanning_tree[row_order]
     # Convert edge list into standard hierarchical clustering format
     return label(min_spanning_tree)
 
@@ -141,7 +146,7 @@ def _hdbscan_brute(
         mutual_reachability_, min_samples=min_samples, sparse=sparse
     )
     # Warn if the MST couldn't be constructed around the missing distances
-    if np.isinf(min_spanning_tree.T[2]).any():
+    if np.isinf(min_spanning_tree["distance"]).any():
         warn(
             "The minimum spanning tree contains edge weights with value "
             "infinity. Potentially, you are missing too many distances "

From 7a07548882526e0be3e61780b674e2c027896cc5 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 8 Nov 2022 16:35:03 -0500
Subject: [PATCH 03/11] Refactor including new struct for simplification

---
 sklearn/cluster/_hdbscan/_linkage.pxd |  14 --
 sklearn/cluster/_hdbscan/_linkage.pyx | 205 +++++++++++++-------------
 sklearn/cluster/_hdbscan/hdbscan.py   |   9 +-
 3 files changed, 106 insertions(+), 122 deletions(-)
 delete mode 100644 sklearn/cluster/_hdbscan/_linkage.pxd

diff --git a/sklearn/cluster/_hdbscan/_linkage.pxd b/sklearn/cluster/_hdbscan/_linkage.pxd
deleted file mode 100644
index a67afdfdaab69..0000000000000
--- a/sklearn/cluster/_hdbscan/_linkage.pxd
+++ /dev/null
@@ -1,14 +0,0 @@
-cimport numpy as cnp
-import numpy as np
-
-# Numpy structured dtype representing a single ordered edge in Prim's algorithm
-MST_edge_dtype = np.dtype([
-    ("current_node", np.intp),
-    ("next_node", np.intp),
-    ("distance", np.float64),
-])
-
-ctypedef struct MST_edge_t:
-    cnp.intp_t current_node
-    cnp.intp_t next_node
-    cnp.float64_t distance
diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index bd45175d06272..1fce979662b03 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -15,43 +15,56 @@ from ...cluster._hierarchical_fast cimport UnionFind
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
 from ...utils._typedefs import ITYPE, DTYPE
 
-cpdef cnp.ndarray[MST_edge_t, ndim=2] mst_from_distance_matrix(
-    cnp.ndarray[cnp.float64_t, ndim=2] distance_matrix
+# Numpy structured dtype representing a single ordered edge in Prim's algorithm
+MST_edge_dtype = np.dtype([
+    ("current_node", np.intp),
+    ("next_node", np.intp),
+    ("distance", np.float64),
+])
+
+ctypedef struct MST_edge_t:
+    cnp.intp_t current_node
+    cnp.intp_t next_node
+    cnp.float64_t distance
+
+# TODO add contiguous constraint where possible
+cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_mutual_reachability(
+    cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability
 ):
     cdef:
         cnp.ndarray[cnp.intp_t, ndim=1] node_labels
         cnp.ndarray[cnp.intp_t, ndim=1] current_labels
-        cnp.ndarray[cnp.float64_t, ndim=1] current_distances, left, right
-        cnp.ndarray[MST_edge_t, ndim=1] result
+        cnp.ndarray[cnp.float64_t, ndim=1] min_reachability, left, right
+        cnp.ndarray[MST_edge_t, ndim=1] mst
 
-        cnp.ndarray label_filter
+        cnp.ndarray[cnp.uint8_t] label_filter
 
-        cnp.intp_t n_samples = distance_matrix.shape[0]
+        cnp.intp_t n_samples = mutual_reachability.shape[0]
         cnp.intp_t current_node, new_node_index, new_node, i
 
-    result = np.empty(n_samples - 1, dtype=MST_edge_dtype)
+    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
     node_labels = np.arange(n_samples, dtype=np.intp)
     current_node = 0
-    current_distances = np.infty * np.ones(n_samples)
+    min_reachability = np.infty * np.ones(n_samples)
     current_labels = node_labels
     for i in range(1, n_samples):
         label_filter = current_labels != current_node
         current_labels = current_labels[label_filter]
-        left = current_distances[label_filter]
-        right = distance_matrix[current_node][current_labels]
-        current_distances = np.minimum(left, right)
+        left = min_reachability[label_filter]
+        right = mutual_reachability[current_node][current_labels]
+        min_reachability = np.minimum(left, right)
 
-        new_node_index = np.argmin(current_distances)
+        new_node_index = np.argmin(min_reachability)
         new_node = current_labels[new_node_index]
-        result[i - 1].current_node = current_node
-        result[i - 1].next_node = new_node
-        result[i - 1].distance = current_distances[new_node_index]
+        mst[i - 1].current_node = current_node
+        mst[i - 1].next_node = new_node
+        mst[i - 1].distance = min_reachability[new_node_index]
         current_node = new_node
 
-    return result
+    return mst
 
 
-cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix(
+cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix(
     cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] raw_data,
     cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] core_distances,
     DistanceMetric dist_metric,
@@ -59,53 +72,39 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix(
 ):
 
     cdef:
-        cnp.ndarray[cnp.float64_t, ndim=1] current_distances_arr
-        cnp.ndarray[cnp.float64_t, ndim=1] current_sources_arr
-        cnp.ndarray[cnp.int8_t, ndim=1] in_tree_arr
-        cnp.ndarray[cnp.float64_t, ndim=2] result_arr
+        cnp.int8_t[::1] in_tree
+        cnp.float64_t[::1] min_reachability, current_sources
+        cnp.float64_t[::1] current_core_distances = core_distances
+        cnp.float64_t[:, ::1] raw_data_view = raw_data
+        cnp.ndarray[MST_edge_t, ndim=1] mst
+        cnp.ndarray[cnp.float64_t, ndim=2] mst_arr
 
-        cnp.float64_t * current_distances
-        cnp.float64_t * current_sources
-        cnp.float64_t * current_core_distances
-        cnp.float64_t * raw_data_ptr
-        cnp.int8_t * in_tree
-        cnp.float64_t[:, ::1] raw_data_view
-        cnp.float64_t[:, ::1] result
-
-        cnp.ndarray label_filter
+        cnp.ndarray[cnp.uint8_t] label_filter
 
         cnp.intp_t current_node, source_node, right_node, left_node, new_node
         cnp.intp_t i, j, n_samples, num_features
 
-        cnp.float64_t current_node_core_distance, new_distance
-        cnp.float64_t right_value, left_value, core_value
+        cnp.float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
+        cnp.float64_t next_node_min_reach, pair_distance, next_node_core_dist
 
     n_samples = raw_data.shape[0]
     num_features = raw_data.shape[1]
 
-    raw_data_view = (<cnp.float64_t[:n_samples, :num_features:1]> (
-        <cnp.float64_t *> raw_data.data))
-    raw_data_ptr = (<cnp.float64_t *> &raw_data_view[0, 0])
+    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
 
-    result_arr = np.zeros((n_samples - 1, 3))
-    in_tree_arr = np.zeros(n_samples, dtype=np.int8)
-    current_node = 0
-    current_distances_arr = np.infty * np.ones(n_samples)
-    current_sources_arr = np.ones(n_samples)
+    in_tree = np.zeros(n_samples, dtype=np.int8)
+    min_reachability = np.infty * np.ones(n_samples)
+    current_sources = np.ones(n_samples)
 
-    result = (<cnp.float64_t[:n_samples - 1, :3:1]> (<cnp.float64_t *> result_arr.data))
-    in_tree = (<cnp.int8_t *> in_tree_arr.data)
-    current_distances = (<cnp.float64_t *> current_distances_arr.data)
-    current_sources = (<cnp.float64_t *> current_sources_arr.data)
-    current_core_distances = (<cnp.float64_t *> core_distances.data)
+    current_node = 0
 
     for i in range(1, n_samples):
 
         in_tree[current_node] = 1
 
-        current_node_core_distance = current_core_distances[current_node]
+        current_node_core_dist = current_core_distances[current_node]
 
-        new_distance = DBL_MAX
+        new_reachability = DBL_MAX
         source_node = 0
         new_node = 0
 
@@ -113,84 +112,82 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2] mst_from_data_matrix(
             if in_tree[j]:
                 continue
 
-            right_value = current_distances[j]
-            right_source = current_sources[j]
+            next_node_min_reach = min_reachability[j]
+            next_node_source = current_sources[j]
 
-            left_value = dist_metric.dist(&raw_data_ptr[num_features *
-                                                        current_node],
-                                          &raw_data_ptr[num_features * j],
-                                          num_features)
-            left_source = current_node
+            pair_distance = dist_metric.dist(
+                &raw_data_view[current_node, 0],
+                &raw_data_view[j, 0],
+                num_features
+            )
 
             if alpha != 1.0:
-                left_value /= alpha
-
-            core_value = core_distances[j]
-            if (current_node_core_distance > right_value or
-                    core_value > right_value or
-                    left_value > right_value):
-                if right_value < new_distance:
-                    new_distance = right_value
-                    source_node = right_source
+                pair_distance /= alpha
+
+            next_node_core_dist = core_distances[j]
+            mutual_reachability_distance = max(
+                current_node_core_dist,
+                next_node_core_dist,
+                pair_distance
+            )
+            if mutual_reachability_distance > next_node_min_reach:
+                if next_node_min_reach < new_reachability:
+                    new_reachability = next_node_min_reach
+                    source_node = next_node_source
                     new_node = j
                 continue
 
-            if core_value > current_node_core_distance:
-                if core_value > left_value:
-                    left_value = core_value
-            else:
-                if current_node_core_distance > left_value:
-                    left_value = current_node_core_distance
-
-            if left_value < right_value:
-                current_distances[j] = left_value
-                current_sources[j] = left_source
-                if left_value < new_distance:
-                    new_distance = left_value
-                    source_node = left_source
+            if mutual_reachability_distance < next_node_min_reach:
+                min_reachability[j] = mutual_reachability_distance
+                current_sources[j] = current_node
+                if mutual_reachability_distance < new_reachability:
+                    new_reachability = mutual_reachability_distance
+                    source_node = current_node
                     new_node = j
             else:
-                if right_value < new_distance:
-                    new_distance = right_value
-                    source_node = right_source
+                if next_node_min_reach < new_reachability:
+                    new_reachability = next_node_min_reach
+                    source_node = next_node_source
                     new_node = j
 
-        result[i - 1, 0] = <cnp.float64_t> source_node
-        result[i - 1, 1] = <cnp.float64_t> new_node
-        result[i - 1, 2] = new_distance
+        mst[i - 1].current_node = source_node
+        mst[i - 1].next_node = new_node
+        mst[i - 1].distance = new_reachability
         current_node = new_node
 
-    return result_arr
+    return mst
 
 @cython.wraparound(True)
-cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(cnp.float64_t[:,:] L):
+cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(MST_edge_t[:] mst):
 
     cdef:
-        cnp.ndarray[cnp.float64_t, ndim=2] result_arr
-        cnp.float64_t[:, ::1] result
+        cnp.ndarray[cnp.float64_t, ndim=2] single_linkage
 
-        cnp.intp_t N, a, aa, b, bb, index
-        cnp.float64_t delta
+        # Note mst.shape[0] is one fewer than the number of samples
+        cnp.intp_t n_samples = mst.shape[0] + 1
+        cnp.intp_t current_node_ancestor, next_node_ancestor
+        cnp.intp_t current_node, next_node, index
+        cnp.float64_t distance
 
-    result_arr = np.zeros((L.shape[0], L.shape[1] + 1))
-    result = (<cnp.float64_t[:L.shape[0], :4:1]> (
-        <cnp.float64_t *> result_arr.data))
-    N = L.shape[0] + 1
-    U = UnionFind(N)
+    single_linkage = np.zeros((n_samples - 1, 4))
+    U = UnionFind(n_samples)
 
-    for index in range(L.shape[0]):
+    for i in range(n_samples - 1):
 
-        a = <cnp.intp_t> L[index, 0]
-        b = <cnp.intp_t> L[index, 1]
-        delta = L[index, 2]
+        current_node = mst[i].current_node
+        next_node = mst[i].next_node
+        distance = mst[i].distance
 
-        aa, bb = U.fast_find(a), U.fast_find(b)
+        current_node_ancestor, next_node_ancestor = (
+            U.fast_find(current_node),
+            U.fast_find(next_node)
+        )
 
-        result[index][0] = aa
-        result[index][1] = bb
-        result[index][2] = delta
-        result[index][3] = U.size[aa] + U.size[bb]
+        single_linkage[i][0] = current_node_ancestor
+        single_linkage[i][1] = next_node_ancestor
+        single_linkage[i][2] = distance
+        single_linkage[i][3] = U.size[current_node_ancestor] + U.size[next_node_ancestor]
 
-        U.union(aa, bb)
+        U.union(current_node_ancestor, next_node_ancestor)
 
-    return result_arr
+    return single_linkage
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index d90a3c4279ee9..95c114dafb070 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -23,7 +23,7 @@
 from ...utils.validation import _assert_all_finite
 from ._linkage import (
     label,
-    mst_from_distance_matrix,
+    mst_from_mutual_reachability,
     mst_from_data_matrix,
     MST_edge_dtype,
 )
@@ -53,7 +53,7 @@
 
 def _brute_mst(mutual_reachability, min_samples, sparse=False):
     if not sparse:
-        return mst_from_distance_matrix(mutual_reachability)
+        return mst_from_mutual_reachability(mutual_reachability)
 
     # Check connected component on mutual reachability
     # If more than one component, it means that even if the distance matrix X
@@ -75,7 +75,9 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False):
     # Compute the minimum spanning tree for the sparse graph
     sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability)
     rows, cols = sparse_min_spanning_tree.nonzero()
-    return np.vstack((rows, cols, sparse_min_spanning_tree.data)).T
+    mst = np.vstack((rows, cols, sparse_min_spanning_tree.data))
+    mst = np.core.records.fromarrays(mst, dtype=MST_edge_dtype, shape=(mst.shape[1],))
+    return mst
 
 
 def _tree_to_labels(
@@ -154,7 +156,6 @@ def _hdbscan_brute(
             "size.",
             UserWarning,
         )
-
     return _process_mst(min_spanning_tree)
 
 

From 9f4fbdfa9eae3f5feb1b1f5c2961065a91f74637 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 8 Nov 2022 16:40:04 -0500
Subject: [PATCH 04/11] Added contiguous specification where applicable

---
 sklearn/cluster/_hdbscan/_linkage.pyx | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index 1fce979662b03..eb5d3191c61ef 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -22,22 +22,23 @@ MST_edge_dtype = np.dtype([
     ("distance", np.float64),
 ])
 
-ctypedef struct MST_edge_t:
+# Packed shouldn't make a difference since they're all 8-byte quantities,
+# but it's included just to be safe.
+ctypedef packed struct MST_edge_t:
     cnp.intp_t current_node
     cnp.intp_t next_node
     cnp.float64_t distance
 
-# TODO add contiguous constraint where possible
-cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_mutual_reachability(
+cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
     cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability
 ):
     cdef:
-        cnp.ndarray[cnp.intp_t, ndim=1] node_labels
-        cnp.ndarray[cnp.intp_t, ndim=1] current_labels
-        cnp.ndarray[cnp.float64_t, ndim=1] min_reachability, left, right
-        cnp.ndarray[MST_edge_t, ndim=1] mst
+        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] node_labels
+        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] current_labels
+        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right
+        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
 
-        cnp.ndarray[cnp.uint8_t] label_filter
+        cnp.ndarray[cnp.uint8_t, mode='c'] label_filter
 
         cnp.intp_t n_samples = mutual_reachability.shape[0]
         cnp.intp_t current_node, new_node_index, new_node, i
@@ -76,10 +77,10 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix(
         cnp.float64_t[::1] min_reachability, current_sources
         cnp.float64_t[::1] current_core_distances = core_distances
         cnp.float64_t[:, ::1] raw_data_view = raw_data
-        cnp.ndarray[MST_edge_t, ndim=1] mst
-        cnp.ndarray[cnp.float64_t, ndim=2] mst_arr
+        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
+        cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] mst_arr
 
-        cnp.ndarray[cnp.uint8_t] label_filter
+        cnp.ndarray[cnp.uint8_t, mode='c'] label_filter
 
         cnp.intp_t current_node, source_node, right_node, left_node, new_node
         cnp.intp_t i, j, n_samples, num_features
@@ -158,10 +159,10 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix(
     return mst
 
 @cython.wraparound(True)
-cpdef cnp.ndarray[cnp.float64_t, ndim=2] label(MST_edge_t[:] mst):
+cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(MST_edge_t[::1] mst):
 
     cdef:
-        cnp.ndarray[cnp.float64_t, ndim=2] single_linkage
+        cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage
 
         # Note mst.shape[0] is one fewer than the number of samples
         cnp.intp_t n_samples = mst.shape[0] + 1

From 0182bc9b605e436ea1d4d1925cf575b7ecf52f78 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 8 Nov 2022 16:40:55 -0500
Subject: [PATCH 05/11] Updated authorship

---
 sklearn/cluster/_hdbscan/_linkage.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index eb5d3191c61ef..78751816f1482 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -1,6 +1,7 @@
 # Minimum spanning tree single linkage implementation for hdbscan
 # Authors: Leland McInnes <leland.mcinnes@gmail.com>
 #          Steve Astels <sastels@gmail.com>
+#          Meekail Zain <zainmeekail@gmail.com>
 # License: 3-clause BSD
 
 cimport numpy as cnp

From 39e7d7e59dfdcf52488bd0d05dc7c43e67becef2 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Wed, 9 Nov 2022 15:32:59 -0500
Subject: [PATCH 06/11] Feedback from review

---
 sklearn/cluster/_hdbscan/_linkage.pyx | 92 ++++++++++++---------------
 1 file changed, 42 insertions(+), 50 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index 78751816f1482..a92d368c0d621 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -5,12 +5,9 @@
 # License: 3-clause BSD
 
 cimport numpy as cnp
-
-import numpy as np
-import cython
-
 from libc.float cimport DBL_MAX
 
+import numpy as np
 from ...metrics._dist_metrics cimport DistanceMetric
 from ...cluster._hierarchical_fast cimport UnionFind
 from ...utils._typedefs cimport ITYPE_t, DTYPE_t
@@ -18,38 +15,36 @@ from ...utils._typedefs import ITYPE, DTYPE
 
 # Numpy structured dtype representing a single ordered edge in Prim's algorithm
 MST_edge_dtype = np.dtype([
-    ("current_node", np.intp),
-    ("next_node", np.intp),
+    ("current_node", np.int64),
+    ("next_node", np.int64),
     ("distance", np.float64),
 ])
 
 # Packed shouldn't make a difference since they're all 8-byte quantities,
 # but it's included just to be safe.
 ctypedef packed struct MST_edge_t:
-    cnp.intp_t current_node
-    cnp.intp_t next_node
+    cnp.int64_t current_node
+    cnp.int64_t next_node
     cnp.float64_t distance
 
 cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
     cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability
 ):
     cdef:
-        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] node_labels
-        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] current_labels
+        cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels
         cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right
         cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
 
         cnp.ndarray[cnp.uint8_t, mode='c'] label_filter
 
-        cnp.intp_t n_samples = mutual_reachability.shape[0]
-        cnp.intp_t current_node, new_node_index, new_node, i
+        cnp.int64_t n_samples = mutual_reachability.shape[0]
+        cnp.int64_t current_node, new_node_index, new_node, i
 
     mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
-    node_labels = np.arange(n_samples, dtype=np.intp)
+    current_labels = np.arange(n_samples, dtype=np.int64)
     current_node = 0
-    min_reachability = np.infty * np.ones(n_samples)
-    current_labels = node_labels
-    for i in range(1, n_samples):
+    min_reachability = np.infty * np.ones(n_samples, dtype=np.float64)
+    for i in range(0, n_samples - 1):
         label_filter = current_labels != current_node
         current_labels = current_labels[label_filter]
         left = min_reachability[label_filter]
@@ -58,33 +53,29 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
 
         new_node_index = np.argmin(min_reachability)
         new_node = current_labels[new_node_index]
-        mst[i - 1].current_node = current_node
-        mst[i - 1].next_node = new_node
-        mst[i - 1].distance = min_reachability[new_node_index]
+        mst[i].current_node = current_node
+        mst[i].next_node = new_node
+        mst[i].distance = min_reachability[new_node_index]
         current_node = new_node
 
     return mst
 
 
-cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix(
-    cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] raw_data,
-    cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] core_distances,
+cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
+    const cnp.float64_t[:, ::1] raw_data,
+    const cnp.float64_t[::1] core_distances,
     DistanceMetric dist_metric,
     cnp.float64_t alpha=1.0
 ):
 
     cdef:
         cnp.int8_t[::1] in_tree
-        cnp.float64_t[::1] min_reachability, current_sources
-        cnp.float64_t[::1] current_core_distances = core_distances
-        cnp.float64_t[:, ::1] raw_data_view = raw_data
+        cnp.float64_t[::1] min_reachability
+        cnp.int64_t[::1] current_sources
         cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
-        cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] mst_arr
-
-        cnp.ndarray[cnp.uint8_t, mode='c'] label_filter
 
-        cnp.intp_t current_node, source_node, right_node, left_node, new_node
-        cnp.intp_t i, j, n_samples, num_features
+        cnp.int64_t current_node, source_node, right_node, left_node, new_node, next_node_source
+        cnp.int64_t i, j, n_samples, num_features
 
         cnp.float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
         cnp.float64_t next_node_min_reach, pair_distance, next_node_core_dist
@@ -95,16 +86,16 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix(
     mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
 
     in_tree = np.zeros(n_samples, dtype=np.int8)
-    min_reachability = np.infty * np.ones(n_samples)
-    current_sources = np.ones(n_samples)
+    min_reachability = np.infty * np.ones(n_samples, dtype=np.float64)
+    current_sources = np.ones(n_samples, dtype=np.int64)
 
     current_node = 0
 
-    for i in range(1, n_samples):
+    for i in range(0, n_samples - 1):
 
         in_tree[current_node] = 1
 
-        current_node_core_dist = current_core_distances[current_node]
+        current_node_core_dist = core_distances[current_node]
 
         new_reachability = DBL_MAX
         source_node = 0
@@ -118,8 +109,8 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix(
             next_node_source = current_sources[j]
 
             pair_distance = dist_metric.dist(
-                &raw_data_view[current_node, 0],
-                &raw_data_view[j, 0],
+                &raw_data[current_node, 0],
+                &raw_data[j, 0],
                 num_features
             )
 
@@ -152,27 +143,26 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1] mst_from_data_matrix(
                     source_node = next_node_source
                     new_node = j
 
-        mst[i - 1].current_node = source_node
-        mst[i - 1].next_node = new_node
-        mst[i - 1].distance = new_reachability
+        mst[i].current_node = source_node
+        mst[i].next_node = new_node
+        mst[i].distance = new_reachability
         current_node = new_node
 
     return mst
 
-@cython.wraparound(True)
-cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(MST_edge_t[::1] mst):
+cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(const MST_edge_t[::1] mst):
 
     cdef:
         cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage
 
         # Note mst.shape[0] is one fewer than the number of samples
-        cnp.intp_t n_samples = mst.shape[0] + 1
-        cnp.intp_t current_node_ancestor, next_node_ancestor
-        cnp.intp_t current_node, next_node, index
+        cnp.int64_t n_samples = mst.shape[0] + 1
+        cnp.int64_t current_node_cluster, next_node_cluster
+        cnp.int64_t current_node, next_node, index
         cnp.float64_t distance
+        UnionFind U = UnionFind(n_samples)
 
     single_linkage = np.zeros((n_samples - 1, 4))
-    U = UnionFind(n_samples)
 
     for i in range(n_samples - 1):
 
@@ -180,16 +170,18 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(MST_edge_t[::1] mst):
         next_node = mst[i].next_node
         distance = mst[i].distance
 
-        current_node_ancestor, next_node_ancestor = (
+        current_node_cluster, next_node_cluster = (
             U.fast_find(current_node),
             U.fast_find(next_node)
         )
 
-        single_linkage[i][0] = current_node_ancestor
-        single_linkage[i][1] = next_node_ancestor
+        # TODO: Update this to an array of structs (AoS).
+        # Should be done simultaneously in _tree.pyx to ensure compatability.
+        single_linkage[i][0] = <cnp.float64_t> current_node_cluster
+        single_linkage[i][1] = <cnp.float64_t> next_node_cluster
         single_linkage[i][2] = distance
-        single_linkage[i][3] = U.size[current_node_ancestor] + U.size[next_node_ancestor]
+        single_linkage[i][3] = U.size[current_node_cluster] + U.size[next_node_cluster]
 
-        U.union(current_node_ancestor, next_node_ancestor)
+        U.union(current_node_cluster, next_node_cluster)
 
     return single_linkage

From 9c38badb91e6a31f7d2b382eb4ad1d8cef5b1fcc Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Wed, 9 Nov 2022 18:04:33 -0500
Subject: [PATCH 07/11] Refactor and remove alpha

---
 sklearn/cluster/_hdbscan/_linkage.pyx | 57 ++++++++++++++++++++++++---
 sklearn/cluster/_hdbscan/hdbscan.py   | 18 ++-------
 2 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index a92d368c0d621..e7f3f8feb21ef 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -30,6 +30,27 @@ ctypedef packed struct MST_edge_t:
 cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
     cnp.ndarray[cnp.float64_t, ndim=2] mutual_reachability
 ):
+    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
+    reachability graph using Prim's algorithm.
+
+    Parameters
+    ----------
+    mutual_reachability : ndarray of shape (n_samples, n_samples)
+        Array of mutual-reachabilities between samples.
+
+    Returns
+    -------
+    mst: ndarray of shape (n_samples - 1,)
+        The MST representation of the mutual-reahability graph. The MST is
+        represented as a collecteion of edges. Each edge is an instance of a
+        custom dtype `MST_edge_dtype` with the following specification:
+
+        MST_edge_dtype = np.dtype([
+            ("current_node", np.int64),
+            ("next_node", np.int64),
+            ("distance", np.float64),
+        ])
+    """
     cdef:
         cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels
         cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right
@@ -65,8 +86,37 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
     const cnp.float64_t[:, ::1] raw_data,
     const cnp.float64_t[::1] core_distances,
     DistanceMetric dist_metric,
-    cnp.float64_t alpha=1.0
 ):
+    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
+    reachability graph generated from the provided `raw_data` and
+    `core_distances` using Prim's algorithm.
+
+    Parameters
+    ----------
+    raw_data : ndarray of shape (n_samples, n_features)
+        Input array of data samples.
+
+    core_distances : ndarray of shape (n_samples,)
+        An array containing the core-distance calculated for each corresponding
+        sample.
+
+    dist_metric : DistanceMetric
+        The distance metric to use when calculating pairwise distances for
+        determining mutual-reachability.
+
+    Returns
+    -------
+    mst: ndarray of shape (n_samples - 1,)
+        The MST representation of the mutual-reahability graph. The MST is
+        represented as a collecteion of edges. Each edge is an instance of a
+        custom dtype `MST_edge_dtype` with the following specification:
+
+        MST_edge_dtype = np.dtype([
+            ("current_node", np.int64),
+            ("next_node", np.int64),
+            ("distance", np.float64),
+        ])
+    """
 
     cdef:
         cnp.int8_t[::1] in_tree
@@ -114,9 +164,6 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
                 num_features
             )
 
-            if alpha != 1.0:
-                pair_distance /= alpha
-
             next_node_core_dist = core_distances[j]
             mutual_reachability_distance = max(
                 current_node_core_dist,
@@ -150,7 +197,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
 
     return mst
 
-cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] label(const MST_edge_t[::1] mst):
+cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst):
 
     cdef:
         cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 95c114dafb070..611dd2113cdf1 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -22,7 +22,7 @@
 from ...utils._param_validation import Interval, StrOptions
 from ...utils.validation import _assert_all_finite
 from ._linkage import (
-    label,
+    make_single_linkage,
     mst_from_mutual_reachability,
     mst_from_data_matrix,
     MST_edge_dtype,
@@ -110,13 +110,12 @@ def _process_mst(min_spanning_tree):
     row_order = np.argsort(min_spanning_tree["distance"])
     min_spanning_tree = min_spanning_tree[row_order]
     # Convert edge list into standard hierarchical clustering format
-    return label(min_spanning_tree)
+    return make_single_linkage(min_spanning_tree)
 
 
 def _hdbscan_brute(
     X,
     min_samples=5,
-    alpha=None,
     metric="euclidean",
     n_jobs=None,
     copy=False,
@@ -132,7 +131,6 @@ def _hdbscan_brute(
         distance_matrix = pairwise_distances(
             X, metric=metric, n_jobs=n_jobs, **metric_params
         )
-    distance_matrix /= alpha
 
     # max_dist is only relevant for sparse and is ignored for dense
     max_dist = metric_params.get("max_dist", 0.0)
@@ -163,7 +161,6 @@ def _hdbscan_prims(
     X,
     algo,
     min_samples=5,
-    alpha=1.0,
     metric="euclidean",
     leaf_size=40,
     n_jobs=None,
@@ -188,8 +185,7 @@ def _hdbscan_prims(
     dist_metric = DistanceMetric.get_metric(metric, **metric_params)
 
     # Mutual reachability distance is implicit in mst_from_data_matrix
-    min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha)
-
+    min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric)
     return _process_mst(min_spanning_tree)
 
 
@@ -294,10 +290,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
     metric_params : dict, default=None
         Arguments passed to the distance metric.
 
-    alpha : float, default=1.0
-        A distance scaling parameter as used in robust single linkage.
-        See [3]_ for more information.
-
     algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto"
         Exactly which algorithm to use for computing core distances; By default
         this is set to `"auto"` which attempts to use a
@@ -458,7 +450,6 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
         ],
         "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable],
         "metric_params": [dict, None],
-        "alpha": [Interval(Real, left=0, right=None, closed="neither")],
         "algorithm": [
             StrOptions(
                 {
@@ -485,7 +476,6 @@ def __init__(
         max_cluster_size=None,
         metric="euclidean",
         metric_params=None,
-        alpha=1.0,
         algorithm="auto",
         leaf_size=40,
         n_jobs=4,
@@ -496,7 +486,6 @@ def __init__(
     ):
         self.min_cluster_size = min_cluster_size
         self.min_samples = min_samples
-        self.alpha = alpha
         self.max_cluster_size = max_cluster_size
         self.cluster_selection_epsilon = cluster_selection_epsilon
         self.metric = metric
@@ -597,7 +586,6 @@ def fit(self, X, y=None):
         kwargs = dict(
             X=X,
             min_samples=self._min_samples,
-            alpha=self.alpha,
             metric=self.metric,
             n_jobs=self.n_jobs,
             **self._metric_params,

From e8ad9339d140a446024a579cae5ac564d6797cfd Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Wed, 9 Nov 2022 18:13:46 -0500
Subject: [PATCH 08/11] Added documentation

---
 sklearn/cluster/_hdbscan/_linkage.pyx | 29 +++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index e7f3f8feb21ef..c35dc6e1c865c 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -40,7 +40,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
 
     Returns
     -------
-    mst: ndarray of shape (n_samples - 1,)
+    mst : ndarray of shape (n_samples - 1,)
         The MST representation of the mutual-reahability graph. The MST is
         represented as a collecteion of edges. Each edge is an instance of a
         custom dtype `MST_edge_dtype` with the following specification:
@@ -106,7 +106,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
 
     Returns
     -------
-    mst: ndarray of shape (n_samples - 1,)
+    mst : ndarray of shape (n_samples - 1,)
         The MST representation of the mutual-reahability graph. The MST is
         represented as a collecteion of edges. Each edge is an instance of a
         custom dtype `MST_edge_dtype` with the following specification:
@@ -198,7 +198,32 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
     return mst
 
 cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST_edge_t[::1] mst):
+    """Construct a single-linkage tree from an MST.
 
+    Parameters
+    ----------
+    mst : ndarray of shape (n_samples - 1,)
+        The MST representation of the mutual-reahability graph. The MST is
+        represented as a collecteion of edges. Each edge is an instance of a
+        custom dtype `MST_edge_dtype` with the following specification:
+
+        MST_edge_dtype = np.dtype([
+            ("current_node", np.int64),
+            ("next_node", np.int64),
+            ("distance", np.float64),
+        ])
+
+    Returns
+    -------
+    single_linkage : ndarray of shape (n_samples - 1, 4)
+        The single-linkage tree (dendrogram) built from the MST. Each row
+        of the array holds the following values, in order:
+
+        - left node/cluster
+        - right node/cluster
+        - distance
+        - new cluster size
+    """
     cdef:
         cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] single_linkage
 

From 50847ec97db09daf6fdb3163266fe2fe98846fc7 Mon Sep 17 00:00:00 2001
From: Meekail Zain <34613774+Micky774@users.noreply.github.com>
Date: Tue, 29 Nov 2022 17:49:50 -0500
Subject: [PATCH 09/11] Apply suggestions from code review

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/cluster/_hdbscan/_linkage.pyx | 45 +++++++--------------------
 1 file changed, 11 insertions(+), 34 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index c35dc6e1c865c..9d47d40847e10 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -40,16 +40,9 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
 
     Returns
     -------
-    mst : ndarray of shape (n_samples - 1,)
+    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
         The MST representation of the mutual-reahability graph. The MST is
-        represented as a collecteion of edges. Each edge is an instance of a
-        custom dtype `MST_edge_dtype` with the following specification:
-
-        MST_edge_dtype = np.dtype([
-            ("current_node", np.int64),
-            ("next_node", np.int64),
-            ("distance", np.float64),
-        ])
+        represented as a collecteion of edges.
     """
     cdef:
         cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels
@@ -64,7 +57,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
     mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
     current_labels = np.arange(n_samples, dtype=np.int64)
     current_node = 0
-    min_reachability = np.infty * np.ones(n_samples, dtype=np.float64)
+    min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64)
     for i in range(0, n_samples - 1):
         label_filter = current_labels != current_node
         current_labels = current_labels[label_filter]
@@ -106,16 +99,9 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
 
     Returns
     -------
-    mst : ndarray of shape (n_samples - 1,)
+    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
         The MST representation of the mutual-reahability graph. The MST is
-        represented as a collecteion of edges. Each edge is an instance of a
-        custom dtype `MST_edge_dtype` with the following specification:
-
-        MST_edge_dtype = np.dtype([
-            ("current_node", np.int64),
-            ("next_node", np.int64),
-            ("distance", np.float64),
-        ])
+        represented as a collecteion of edges.
     """
 
     cdef:
@@ -136,7 +122,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
     mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
 
     in_tree = np.zeros(n_samples, dtype=np.int8)
-    min_reachability = np.infty * np.ones(n_samples, dtype=np.float64)
+    min_reachability = np.full(n_samples, fill_value=np.infty, dtype=np.float64)
     current_sources = np.ones(n_samples, dtype=np.int64)
 
     current_node = 0
@@ -202,16 +188,9 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST
 
     Parameters
     ----------
-    mst : ndarray of shape (n_samples - 1,)
+    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
         The MST representation of the mutual-reahability graph. The MST is
-        represented as a collecteion of edges. Each edge is an instance of a
-        custom dtype `MST_edge_dtype` with the following specification:
-
-        MST_edge_dtype = np.dtype([
-            ("current_node", np.int64),
-            ("next_node", np.int64),
-            ("distance", np.float64),
-        ])
+        represented as a collecteion of edges.
 
     Returns
     -------
@@ -234,7 +213,7 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST
         cnp.float64_t distance
         UnionFind U = UnionFind(n_samples)
 
-    single_linkage = np.zeros((n_samples - 1, 4))
+    single_linkage = np.zeros((n_samples - 1, 4), dtype=np.float64)
 
     for i in range(n_samples - 1):
 
@@ -242,10 +221,8 @@ cpdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] make_single_linkage(const MST
         next_node = mst[i].next_node
         distance = mst[i].distance
 
-        current_node_cluster, next_node_cluster = (
-            U.fast_find(current_node),
-            U.fast_find(next_node)
-        )
+        current_node_cluster = U.fast_find(current_node)
+        next_node_cluster = U.fast_find(next_node)
 
         # TODO: Update this to an array of structs (AoS).
         # Should be done simultaneously in _tree.pyx to ensure compatability.

From e533f0c6399f7b99db0282bcf17eed8467c9d972 Mon Sep 17 00:00:00 2001
From: Meekail Zain <zainmeekail@gmail.com>
Date: Tue, 29 Nov 2022 18:18:18 -0500
Subject: [PATCH 10/11] Review feedback and revert alpha changes

---
 sklearn/cluster/_hdbscan/_linkage.pyx |  6 ++++++
 sklearn/cluster/_hdbscan/hdbscan.py   | 19 ++++++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index 9d47d40847e10..1957a27eab69f 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -45,6 +45,8 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
         represented as a collecteion of edges.
     """
     cdef:
+        # Note: we utilize ndarrays over memory-views to make use of numpy
+        # boolean-mask indexing and sub-selection below.
         cnp.ndarray[cnp.int64_t, ndim=1, mode='c'] current_labels
         cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] min_reachability, left, right
         cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst
@@ -79,6 +81,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
     const cnp.float64_t[:, ::1] raw_data,
     const cnp.float64_t[::1] core_distances,
     DistanceMetric dist_metric,
+    cnp.float64_t alpha=1.0
 ):
     """Compute the Minimum Spanning Tree (MST) representation of the mutual-
     reachability graph generated from the provided `raw_data` and
@@ -150,6 +153,9 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
                 num_features
             )
 
+            if alpha != 1.0:
+                pair_distance /= alpha
+
             next_node_core_dist = core_distances[j]
             mutual_reachability_distance = max(
                 current_node_core_dist,
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
index 611dd2113cdf1..4f1fcf1962d0b 100644
--- a/sklearn/cluster/_hdbscan/hdbscan.py
+++ b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -75,8 +75,10 @@ def _brute_mst(mutual_reachability, min_samples, sparse=False):
     # Compute the minimum spanning tree for the sparse graph
     sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability)
     rows, cols = sparse_min_spanning_tree.nonzero()
-    mst = np.vstack((rows, cols, sparse_min_spanning_tree.data))
-    mst = np.core.records.fromarrays(mst, dtype=MST_edge_dtype, shape=(mst.shape[1],))
+    mst = np.core.records.fromarrays(
+        [rows, cols, sparse_min_spanning_tree.data],
+        dtype=MST_edge_dtype,
+    )
     return mst
 
 
@@ -116,6 +118,7 @@ def _process_mst(min_spanning_tree):
 def _hdbscan_brute(
     X,
     min_samples=5,
+    alpha=None,
     metric="euclidean",
     n_jobs=None,
     copy=False,
@@ -131,6 +134,7 @@ def _hdbscan_brute(
         distance_matrix = pairwise_distances(
             X, metric=metric, n_jobs=n_jobs, **metric_params
         )
+    distance_matrix /= alpha
 
     # max_dist is only relevant for sparse and is ignored for dense
     max_dist = metric_params.get("max_dist", 0.0)
@@ -161,6 +165,7 @@ def _hdbscan_prims(
     X,
     algo,
     min_samples=5,
+    alpha=1.0,
     metric="euclidean",
     leaf_size=40,
     n_jobs=None,
@@ -185,7 +190,7 @@ def _hdbscan_prims(
     dist_metric = DistanceMetric.get_metric(metric, **metric_params)
 
     # Mutual reachability distance is implicit in mst_from_data_matrix
-    min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric)
+    min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha)
     return _process_mst(min_spanning_tree)
 
 
@@ -290,6 +295,10 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
     metric_params : dict, default=None
         Arguments passed to the distance metric.
 
+    alpha : float, default=1.0
+        A distance scaling parameter as used in robust single linkage.
+        See [3]_ for more information.
+
     algorithm : {"auto", "brute", "kdtree", "balltree"}, default="auto"
         Exactly which algorithm to use for computing core distances; By default
         this is set to `"auto"` which attempts to use a
@@ -450,6 +459,7 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
         ],
         "metric": [StrOptions(set(FAST_METRICS + ["precomputed"])), callable],
         "metric_params": [dict, None],
+        "alpha": [Interval(Real, left=0, right=None, closed="neither")],
         "algorithm": [
             StrOptions(
                 {
@@ -476,6 +486,7 @@ def __init__(
         max_cluster_size=None,
         metric="euclidean",
         metric_params=None,
+        alpha=1.0,
         algorithm="auto",
         leaf_size=40,
         n_jobs=4,
@@ -486,6 +497,7 @@ def __init__(
     ):
         self.min_cluster_size = min_cluster_size
         self.min_samples = min_samples
+        self.alpha = alpha
         self.max_cluster_size = max_cluster_size
         self.cluster_selection_epsilon = cluster_selection_epsilon
         self.metric = metric
@@ -586,6 +598,7 @@ def fit(self, X, y=None):
         kwargs = dict(
             X=X,
             min_samples=self._min_samples,
+            alpha=self.alpha,
             metric=self.metric,
             n_jobs=self.n_jobs,
             **self._metric_params,

From f154164684046d3769c253faa6075ad8a3562f84 Mon Sep 17 00:00:00 2001
From: Meekail Zain <34613774+Micky774@users.noreply.github.com>
Date: Tue, 6 Dec 2022 17:39:36 -0500
Subject: [PATCH 11/11] Update sklearn/cluster/_hdbscan/_linkage.pyx

Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
---
 sklearn/cluster/_hdbscan/_linkage.pyx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/cluster/_hdbscan/_linkage.pyx b/sklearn/cluster/_hdbscan/_linkage.pyx
index 1957a27eab69f..fd9888ac4da82 100644
--- a/sklearn/cluster/_hdbscan/_linkage.pyx
+++ b/sklearn/cluster/_hdbscan/_linkage.pyx
@@ -153,8 +153,7 @@ cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
                 num_features
             )
 
-            if alpha != 1.0:
-                pair_distance /= alpha
+            pair_distance /= alpha
 
             next_node_core_dist = core_distances[j]
             mutual_reachability_distance = max(