diff --git a/sklearn/metrics/_dist_metrics.pyx.tp b/sklearn/metrics/_dist_metrics.pyx.tp index b7d3d1f4d86a6..4e1e53933ccb4 100644 --- a/sklearn/metrics/_dist_metrics.pyx.tp +++ b/sklearn/metrics/_dist_metrics.pyx.tp @@ -232,6 +232,7 @@ cdef class DistanceMetric: # metric mappings # These map from metric id strings to class names METRIC_MAPPING{{name_suffix}} = { + 'precomputed': PrecomputedDistanceMatrix{{name_suffix}}, 'euclidean': EuclideanDistance{{name_suffix}}, 'l2': EuclideanDistance{{name_suffix}}, 'minkowski': MinkowskiDistance{{name_suffix}}, @@ -359,13 +360,17 @@ cdef class DistanceMetric{{name_suffix}}(DistanceMetric): **User-defined distance:** - =========== =============== ======= - identifier class name args - ----------- --------------- ------- - "pyfunc" PyFuncDistance func - =========== =============== ======= + ============= ========================= =========== + identifier class name args + ------------- ------------------------- ----------- + "precomputed" PrecomputedDistanceMatrix precomputed + "pyfunc" PyFuncDistance func + ============= ========================= =========== - Here ``func`` is a function which takes two one-dimensional numpy + "precomputed" indicates that the distances have already been computed + and that the precomputed distance matrix is passed in directly. + + ``func`` is a function which takes two one-dimensional numpy arrays, and returns a distance. Note that in order to be used within the BallTree, the distance must be a true metric: i.e. 
it must satisfy the following properties diff --git a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp index 51fb745dca784..2f2d8e03c27a0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp @@ -13,6 +13,7 @@ from ._classmode cimport WeightingStrategy {{for name_suffix in ["32", "64"]}} from ._argkmin cimport ArgKmin{{name_suffix}} from ._datasets_pair cimport DatasetsPair{{name_suffix}} +from ._datasets_pair cimport PrecomputedDistanceMatrix{{name_suffix}} cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): """ diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp index 9578129993c37..3b8ae51b970fd 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -20,8 +20,7 @@ cdef class BaseDistancesReduction{{name_suffix}}: Implementations inherit from this template and may override the several defined hooks as needed in order to easily extend functionality with minimal redundant code. - """ - + """ cdef: readonly DatasetsPair{{name_suffix}} datasets_pair diff --git a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp index 2bbfd74e2c2c3..56e27b0b6b2e0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -127,6 +127,9 @@ cdef class BaseDistancesReduction{{name_suffix}}: Implementations inherit from this template and may override the several defined hooks as needed in order to easily extend functionality with minimal redundant code. 
+ + If metric is 'precomputed' and the precomputed matrix is provided, + a subclass must be able to access it through the compute method. """ def __init__( @@ -137,7 +140,6 @@ cdef class BaseDistancesReduction{{name_suffix}}: ): cdef: intp_t X_n_full_chunks, Y_n_full_chunks - if chunk_size is None: chunk_size = get_config().get("pairwise_dist_chunk_size", 256) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp index 1e57b3291a8f4..fe91bf088a8c5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -29,6 +29,11 @@ cdef class DatasetsPair{{name_suffix}}: cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil +cdef class PrecomputedDistanceMatrix{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] distance_matrix + + cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): cdef: const {{INPUT_DTYPE_t}}[:, ::1] X diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp index 2c3ca44047145..3a0e3f27576f6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -53,8 +53,8 @@ cdef class DatasetsPair{{name_suffix}}: @classmethod def get_for( cls, - X, - Y, + X=None, + Y=None, metric="euclidean", dict metric_kwargs=None, ) -> DatasetsPair{{name_suffix}}: @@ -98,6 +98,9 @@ cdef class DatasetsPair{{name_suffix}}: metric_kwargs = copy.copy(metric_kwargs) metric_kwargs.pop("X_norm_squared", None) metric_kwargs.pop("Y_norm_squared", None) + if metric == "precomputed": + return PrecomputedDistanceMatrix{{name_suffix}}(X) + cdef: {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( metric, @@ 
-158,6 +161,43 @@ cdef class DatasetsPair{{name_suffix}}: # TODO: add "with gil: raise" here when supporting Cython 3.0 return -1 + +@final +cdef class PrecomputedDistanceMatrix{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Datasets pair backed by a precomputed distance matrix. + + Parameters + ---------- + precomputed_distance : C-contiguous ndarray of shape (n_samples_X, n_samples_Y) + """ + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] precomputed_distance, + ): + super().__init__( + distance_metric=DistanceMetric{{name_suffix}}(), + n_features=0, + ) + # This array has already been checked. + self.distance_matrix = precomputed_distance + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.distance_matrix.shape[0] + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.distance_matrix.shape[1] + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_matrix[i, j] + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_matrix[i, j] + @final cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): """Compute distances between row vectors of two arrays. diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index d8307cbe84eaa..14cee8d50013a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -81,10 +81,12 @@ def valid_metrics(cls) -> List[str]: "hamming", *BOOL_METRICS, } - return sorted(({"sqeuclidean"} | set(METRIC_MAPPING64.keys())) - excluded) + return sorted( + ({"sqeuclidean", "precomputed"} | set(METRIC_MAPPING64.keys())) - excluded + ) @classmethod - def is_usable_for(cls, X, Y, metric) -> bool: + def is_usable_for(cls, X=None, Y=None, metric="euclidean") -> bool: """Return True if the dispatcher can be used for the given parameters. 
@@ -96,6 +98,8 @@ def is_usable_for(cls, X, Y, metric) -> bool: Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) Input data. + precomputed: ndarray of shape (n_samples_X, n_samples_Y) + metric : str, default='euclidean' The distance metric to use. For a list of available metrics, see the documentation of @@ -105,7 +109,15 @@ def is_usable_for(cls, X, Y, metric) -> bool: ------- True if the dispatcher can be used, else False. """ - + if metric == "precomputed": + if X is not None and Y is None: + is_usable = True + else: + is_usable = False + + # Usable only when the precomputed matrix is passed as X and Y is omitted. + if not is_usable: + return is_usable # FIXME: the current Cython implementation is too slow for a large number of # features. We temporarily disable it to fallback on SciPy's implementation. # See: https://github.com/scikit-learn/scikit-learn/issues/28191 @@ -188,9 +200,9 @@ class ArgKmin(BaseDistancesReductionDispatcher): @classmethod def compute( cls, - X, - Y, - k, + X=None, + Y=None, + k=None, metric="euclidean", chunk_size=None, metric_kwargs=None, @@ -277,6 +289,25 @@ def compute( for the concrete implementation are therefore freed when this classmethod returns. """ + """ + if X is None and Y is None and precomputed_matrix is None: + raise ValueError("Either X and Y or precomputed_matrix must be provided.") + elif X is not None and Y is not None and precomputed_matrix is not None: + raise ValueError( + "Only one of X and Y or precomputed_matrix must be provided." 
+ ) + elif X is None and Y is not None: + raise ValueError("Y should not be provided without X.") + elif X is not None and Y is None: + raise ValueError("X should not be provided without Y.") + """ + + if metric == "precomputed": + if X is None: + raise ValueError("Either X and Y or X as precomputed matrix must be provided") + if Y is not None: + raise ValueError("Y should not be provided with metric='precomputed'") + if X.dtype == Y.dtype == np.float64: return ArgKmin64.compute( X=X, @@ -326,9 +357,9 @@ class RadiusNeighbors(BaseDistancesReductionDispatcher): @classmethod def compute( cls, - X, - Y, - radius, + X=None, + Y=None, + radius=None, metric="euclidean", chunk_size=None, metric_kwargs=None, @@ -421,6 +452,24 @@ def compute( for the concrete implementation are therefore freed when this classmethod returns. """ + """ + if X is None and Y is None and precomputed is None: + raise ValueError("Either X and Y or precomputed must be provided.") + elif X is not None and Y is not None and precomputed is not None: + raise ValueError("Only one of X and Y or precomputed must be provided.") + elif X is None and Y is not None: + raise ValueError("Y should not be provided without X.") + elif X is not None and Y is None: + raise ValueError("X should not be provided without Y.") + elif precomputed: + return precomputed + """ + if metric == "precomputed": + if X is None: + raise ValueError("Either X and Y or X as precomputed matrix must be provided") + if Y is not None: + raise ValueError("Y should not be provided with metric='precomputed'") + if X.dtype == Y.dtype == np.float64: return RadiusNeighbors64.compute( X=X, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index d0567f2ead804..64c4b7b730833 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -101,7 +101,7 @@ cdef class 
RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}) # Fall back on a generic implementation that handles most scipy # metrics by computing the distances between 2 vectors at a time. pda = RadiusNeighbors{{name_suffix}}( - datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric=metric, metric_kwargs=metric_kwargs), radius=radius, chunk_size=chunk_size, strategy=strategy, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp index 0a9b22251843e..7e62b3d77a2fc 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp @@ -48,7 +48,7 @@ cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix} # Use a generic implementation that handles most scipy # metrics by computing the distances between 2 vectors at a time. pda = RadiusNeighborsClassMode{{name_suffix}}( - datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric=metric, metric_kwargs=metric_kwargs), radius=radius, chunk_size=chunk_size, strategy=strategy, diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 0ea6d5d094d56..afea771a8c196 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -103,13 +103,43 @@ def assert_same_distances_for_common_neighbors( ) from e +def assert_precomputed(precomputed, n_samples_X, n_samples_Y): + """ + Validates a precomputed matrix for compatibility. + + Parameters: + precomputed (np.ndarray): The precomputed matrix to validate. 
+ n_samples_X (int): The expected number of rows in the matrix. + n_samples_Y (int): The expected number of columns in the matrix. + + Raises: + AssertionError: If the input is not valid. + """ + # Check if the input is a numpy array + if not isinstance(precomputed, np.ndarray): + raise AssertionError("Input must be a numpy array.") + + # Check if the array has the correct data type + if precomputed.dtype not in [np.float32, np.float64]: + raise AssertionError( + "Precomputed matrix must be of type float (float32 or float64)." + ) + + # Check if the array is empty + if precomputed.size == 0: + raise AssertionError("Precomputed matrix should not be empty.") + + # Check if the dimensions match the expected shape + expected_shape = (n_samples_X, n_samples_Y) + if precomputed.shape != expected_shape: + raise AssertionError( + f"Incorrect dimensions for precomputed matrix. " + f"Expected: {expected_shape}, Got: {precomputed.shape}." + ) + + def assert_no_missing_neighbors( - query_idx, - dist_row_a, - dist_row_b, - indices_row_a, - indices_row_b, - threshold, + query_idx, dist_row_a, dist_row_b, indices_row_a, indices_row_b, threshold ): """Compare the indices of neighbors in two results sets. 
@@ -348,6 +378,128 @@ def assert_compatible_radius_results( } + + + + + +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_all_inputs_none(cls): + """Test that ValueError is raised when all inputs are None.""" + with pytest.raises( + ValueError, match="Either X and Y or X as precomputed matrix must be provided" + ): + cls.compute(X=None, Y=None, metric="precomputed") + + +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_with_y(cls): + """Test that ValueError is raised when Y is provided in precomputed mode.""" + X = np.random.rand(10, 10) # Precomputed matrix + Y = np.random.rand(10, 5) + with pytest.raises( + ValueError, match="Y should not be provided with metric='precomputed'" + ): + cls.compute(X=X, Y=Y, metric="precomputed") + + +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_invalid_metric(cls): + """Test that a non-'precomputed' metric is rejected for precomputed input.""" + X = np.random.rand(10, 10) # Precomputed matrix + with pytest.raises( + ValueError, match="Metric must be 'precomputed' when X is a precomputed matrix" + ): + cls.compute(X=X, Y=None, metric="euclidean") + + +@pytest.mark.parametrize("cls", [ArgKmin, RadiusNeighbors]) +def test_precompute_valid_matrix(cls): + """Test valid precomputed matrix with correct dimensions.""" + X_data = np.random.rand(5, 3) + Y_data = np.random.rand(4, 3) + D = pairwise_distances(X_data, Y_data) # Shape: (5, 4) + if cls is ArgKmin: + result = cls.compute(X=D, k=2, metric="precomputed") + assert result.shape == (5, 2) + else: # RadiusNeighbors + result = cls.compute(X=D, radius=1.0, metric="precomputed") + assert len(result) == 5 # List of indices per sample + + +def test_precompute_consistency(): + """Test consistency between precomputed and computed distances for ArgKmin.""" + X_data = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64) # 3 samples, 2 features + Y_data = np.array([[7, 8], [9, 10]], dtype=np.float64) # 2 samples, 2 
features + D = pairwise_distances(X_data, Y_data) # Shape: (3, 2) + k = 2 + result_precomputed = ArgKmin.compute(X=D, k=k, metric="precomputed") + result_computed = ArgKmin.compute(X=X_data, Y=Y_data, k=k, metric="euclidean") + np.testing.assert_allclose(result_precomputed, result_computed, rtol=1e-5) + + +def test_assert_precomputed(): + """Test validation of precomputed matrix.""" + n_samples_X, n_samples_Y = 5, 5 + # Success case + valid_matrix = np.random.rand(n_samples_X, n_samples_Y).astype(np.float64) + assert_precomputed(valid_matrix, n_samples_X, n_samples_Y) + + # Failure: Not a numpy array + with pytest.raises(AssertionError, match="Input must be a numpy array"): + assert_precomputed([[1, 2], [3, 4]], n_samples_X, n_samples_Y) + + # Failure: Incorrect dtype + invalid_dtype = np.random.randint(0, 10, (n_samples_X, n_samples_Y)) + with pytest.raises( + AssertionError, match="Precomputed matrix must be of type float" + ): + assert_precomputed(invalid_dtype, n_samples_X, n_samples_Y) + + # Failure: Empty array + with pytest.raises(AssertionError, match="Precomputed matrix should not be empty"): + assert_precomputed(np.array([]), n_samples_X, n_samples_Y) + + # Failure: Incorrect dimensions (one extra column, so the shape always differs) + incorrect_shape = np.random.rand(n_samples_X, n_samples_Y + 1).astype(np.float64) + with pytest.raises( + AssertionError, match="Incorrect dimensions for precomputed matrix" + ): + assert_precomputed(incorrect_shape, n_samples_X, n_samples_Y) + + +def assert_precomputed(precomputed, n_samples_X, n_samples_Y): + """ + Validates a precomputed matrix for compatibility. + + Parameters + ---------- + precomputed : np.ndarray + The precomputed matrix to validate. + n_samples_X : int + The expected number of rows in the matrix. + n_samples_Y : int + The expected number of columns in the matrix. + + Raises + ------ + AssertionError + If the input is not a valid numpy array, has incorrect dtype, is empty, + or has incorrect dimensions. 
+ """ + if not isinstance(precomputed, np.ndarray): + raise AssertionError("Input must be a numpy array") + if precomputed.dtype not in [np.float32, np.float64]: + raise AssertionError("Precomputed matrix must be of type float") + if precomputed.size == 0: + raise AssertionError("Precomputed matrix should not be empty") + expected_shape = (n_samples_X, n_samples_Y) + if precomputed.shape != expected_shape: + raise AssertionError( + f"Incorrect dimensions for precomputed matrix. Expected: {expected_shape}, Got: {precomputed.shape}" + ) + + def test_assert_compatible_argkmin_results(): atol = 1e-7 rtol = 0.0