From 722033cf96341531802bb34b9d17af5e38b3dc6c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 30 Mar 2023 17:56:14 -0600 Subject: [PATCH] Co-authored-by: @jjerphan Add initial commit Signed-off-by: Adam Li --- sklearn/metrics/_dist_metrics.pyx.tp | 13 +++++ .../_datasets_pair.pxd.tp | 5 ++ .../_datasets_pair.pyx.tp | 47 +++++++++++++++++++ .../_dispatcher.py | 4 +- 4 files changed, 68 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index ed09552ed5914..9809d6fea009a 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -100,6 +100,7 @@ METRIC_MAPPING{{name_suffix}} = { 'jaccard': JaccardDistance{{name_suffix}}, 'dice': DiceDistance{{name_suffix}}, 'kulsinski': KulsinskiDistance{{name_suffix}}, + 'precomputed': PrecomputedDistanceMatrix{{name_suffix}}, 'rogerstanimoto': RogersTanimotoDistance{{name_suffix}}, 'russellrao': RussellRaoDistance{{name_suffix}}, 'sokalmichener': SokalMichenerDistance{{name_suffix}}, @@ -215,6 +216,18 @@ cdef class DistanceMetric{{name_suffix}}: "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) ================= ======================= =============================== + **Metrics with precomputed distances:** Any user can compute a distance + matrix and provide access to the distances, neighbors and other + data defined in this interface. The precomputed distance matrix should have + shape (n_samples_X, n_samples_Y) and fulfill the properties of a valid distance + metric. 
+ + ================= ========================= =============================== + identifier class name distance function + ----------------- ------------------------- ------------------------------- + "precomputed" PrecomputedDistanceMatrix predefined + ================= ========================= =============================== + **User-defined distance:** =========== =============== ======= diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index 23337cb2b59d6..6447e0fab6941 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -32,6 +32,11 @@ cdef class DatasetsPair{{name_suffix}}: cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil +cdef class PrecomputedDistanceMatrix{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] distance_matrix + + cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:, ::1] X diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 5569c1f231d62..397ecdb021b0d 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -95,6 +95,12 @@ cdef class DatasetsPair{{name_suffix}}: # a RuntimeError), we pop it here. 
if metric_kwargs is not None: metric_kwargs.pop("Y_norm_squared", None) + + if metric == 'precomputed': + return PrecomputedDistanceMatrix{{name_suffix}}( + distance_matrix=Y, + ) + cdef: {{DistanceMetric}} distance_metric = {{DistanceMetric}}.get_metric( metric, @@ -154,6 +160,47 @@ cdef class DatasetsPair{{name_suffix}}: # TODO: add "with gil: raise" here when supporting Cython 3.0 return -1 + +@final +cdef class PrecomputedDistanceMatrix{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Expose a precomputed distance matrix through the DatasetsPair interface. + + Parameters + ---------- + distance_matrix : ndarray of shape (n_samples_X, n_samples_Y) + distance_matrix[i, j] is the distance from X[i] to Y[j]. Must be C-contiguous. + """ + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] distance_matrix, + ): + super().__init__( + # This DistanceMetric is necessary for conversion between + # reduced distance and distance (it performs no-ops). + distance_metric={{DistanceMetric}}(), + n_features=0, + ) + # Arrays have already been checked + self.distance_matrix = distance_matrix + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.distance_matrix.shape[0] + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.distance_matrix.shape[1] + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_matrix[i, j] + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_matrix[i, j] + + @final cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): """Compute distances between row vectors of two arrays. 
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index 73d98f2ebe6b2..160954539ec65 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -76,7 +76,9 @@ def valid_metrics(cls) -> List[str]: "hamming", *BOOL_METRICS, } - return sorted(({"sqeuclidean"} | set(METRIC_MAPPING.keys())) - excluded) + return sorted( + ({"sqeuclidean", "precomputed"} | set(METRIC_MAPPING.keys())) - excluded + ) @classmethod def is_usable_for(cls, X, Y, metric) -> bool: