Commit d4d4af8
MNT Move entropy to private function (scikit-learn#31294)
Co-authored-by: Jérémie du Boisberranger <[email protected]>
1 parent 4872503 commit d4d4af8

File tree: 6 files changed (+51 -22 lines)

doc/modules/array_api.rst

Lines changed: 0 additions & 1 deletion

@@ -132,7 +132,6 @@ base estimator also does:
 Metrics
 -------
 
-- :func:`sklearn.metrics.cluster.entropy`
 - :func:`sklearn.metrics.accuracy_score`
 - :func:`sklearn.metrics.d2_tweedie_score`
 - :func:`sklearn.metrics.explained_variance_score`
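
The removed bullet takes `entropy` off the list of array-API-capable metrics, consistent with the function going private; the surrounding entries such as `accuracy_score` remain dispatchable. A hedged sketch of that dispatch, assuming the optional array-api-compat dependency and PyTorch are installed (the data values are illustrative, not from this commit):

    import torch
    from sklearn import config_context
    from sklearn.metrics import accuracy_score

    # With array API dispatch enabled, supported metrics accept non-NumPy arrays.
    with config_context(array_api_dispatch=True):
        y_true = torch.asarray([0, 1, 1, 0])
        y_pred = torch.asarray([0, 1, 0, 0])
        print(accuracy_score(y_true, y_pred))  # 0.75
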
Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+- :func:`metrics.cluster.entropy` is deprecated and will be removed in v1.10.
+  By :user:`Lucy Liu <lucyleeow>`
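
For code that still calls the public function, one possible migration path (an assumption, not a recommendation stated in this changelog entry) is to compute the label entropy directly from the label counts with SciPy, which uses the same natural-log convention:

    import numpy as np
    from scipy.stats import entropy as scipy_entropy

    labels = np.asarray([0, 0, 42.0])
    _, counts = np.unique(labels, return_counts=True)  # counts per distinct label
    print(scipy_entropy(counts))  # ~0.6365141, matching metrics.cluster.entropy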

sklearn/metrics/cluster/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -14,6 +14,7 @@
     adjusted_rand_score,
     completeness_score,
     contingency_matrix,
+    # TODO(1.10): Remove
     entropy,
     expected_mutual_information,
     fowlkes_mallows_score,
@@ -40,6 +41,7 @@
     "consensus_score",
     "contingency_matrix",
     "davies_bouldin_score",
+    # TODO(1.10): Remove
     "entropy",
     "expected_mutual_information",
     "fowlkes_mallows_score",

sklearn/metrics/cluster/_supervised.py

Lines changed: 28 additions & 11 deletions

@@ -14,6 +14,7 @@
 import numpy as np
 from scipy import sparse as sp
 
+from ...utils import deprecated
 from ...utils._array_api import _max_precision_float_dtype, get_namespace_and_device
 from ...utils._param_validation import Hidden, Interval, StrOptions, validate_params
 from ...utils.multiclass import type_of_target
@@ -530,8 +531,8 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0):
     if len(labels_true) == 0:
         return 1.0, 1.0, 1.0
 
-    entropy_C = entropy(labels_true)
-    entropy_K = entropy(labels_pred)
+    entropy_C = _entropy(labels_true)
+    entropy_K = _entropy(labels_pred)
 
     contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
     MI = mutual_info_score(None, None, contingency=contingency)
@@ -1042,7 +1043,7 @@ def adjusted_mutual_info_score(
     # Calculate the expected value for the mutual information
     emi = expected_mutual_information(contingency, n_samples)
     # Calculate entropy for each labeling
-    h_true, h_pred = entropy(labels_true), entropy(labels_pred)
+    h_true, h_pred = _entropy(labels_true), _entropy(labels_pred)
     normalizer = _generalized_average(h_true, h_pred, average_method)
     denominator = normalizer - emi
     # Avoid 0.0 / 0.0 when expectation equals maximum, i.e. a perfect match.
@@ -1168,7 +1169,7 @@ def normalized_mutual_info_score(
         return 0.0
 
     # Calculate entropy for each labeling
-    h_true, h_pred = entropy(labels_true), entropy(labels_pred)
+    h_true, h_pred = _entropy(labels_true), _entropy(labels_pred)
 
     normalizer = _generalized_average(h_true, h_pred, average_method)
     return float(mi / normalizer)
@@ -1272,13 +1273,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, *, sparse="deprecated"):
     return float(np.sqrt(tk / pk) * np.sqrt(tk / qk)) if tk != 0.0 else 0.0
 
 
-@validate_params(
-    {
-        "labels": ["array-like"],
-    },
-    prefer_skip_nested_validation=True,
-)
-def entropy(labels):
+def _entropy(labels):
     """Calculate the entropy for a labeling.
 
     Parameters
@@ -1312,3 +1307,25 @@ def entropy(labels):
     # Always convert the result as a Python scalar (on CPU) instead of a device
     # specific scalar array.
     return float(-xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum))))
+
+
+# TODO(1.10): Remove
+@deprecated("`entropy` is deprecated in 1.8 and will be removed in 1.10.")
+def entropy(labels):
+    """Calculate the entropy for a labeling.
+
+    Parameters
+    ----------
+    labels : array-like of shape (n_samples,), dtype=int
+        The labels.
+
+    Returns
+    -------
+    entropy : float
+        The entropy for a labeling.
+
+    Notes
+    -----
+    The logarithm used is the natural logarithm (base-e).
+    """
+    return _entropy(labels)
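
The rename does not change the math: `_entropy` still computes the natural-log entropy of the label distribution, H = -sum(p_i * ln p_i), via the array-API-aware formulation above. A minimal NumPy-only sketch of the same formula, for intuition (the literal values mirror test_entropy below, and the empty-input convention of returning 1.0 is taken from that test):

    import numpy as np

    def entropy_sketch(labels):
        # Count occurrences of each distinct label.
        _, counts = np.unique(np.asarray(labels), return_counts=True)
        if counts.size == 0:
            return 1.0  # convention for empty input, per test_entropy
        p = counts / counts.sum()
        # Natural-log entropy: H = -sum(p * ln p).
        return float(-np.sum(p * np.log(p)))

    print(entropy_sketch([0, 0, 42.0]))  # ~0.6365141
    print(entropy_sketch([]))            # 1.0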

sklearn/metrics/cluster/tests/test_supervised.py

Lines changed: 19 additions & 9 deletions

@@ -10,7 +10,6 @@
     adjusted_rand_score,
     completeness_score,
     contingency_matrix,
-    entropy,
     expected_mutual_information,
     fowlkes_mallows_score,
     homogeneity_completeness_v_measure,
@@ -21,7 +20,12 @@
     rand_score,
     v_measure_score,
 )
-from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings
+from sklearn.metrics.cluster._supervised import (
+    _entropy,
+    _generalized_average,
+    check_clusterings,
+    entropy,
+)
 from sklearn.utils import assert_all_finite
 from sklearn.utils._array_api import (
     _get_namespace_device_dtype_ids,
@@ -267,10 +271,16 @@ def test_int_overflow_mutual_info_fowlkes_mallows_score():
     assert_all_finite(fowlkes_mallows_score(x, y))
 
 
+# TODO(1.10): Remove
+def test_public_entropy_deprecation():
+    with pytest.warns(FutureWarning, match="Function entropy is deprecated"):
+        entropy([0, 0, 42.0])
+
+
 def test_entropy():
-    assert_almost_equal(entropy([0, 0, 42.0]), 0.6365141, 5)
-    assert_almost_equal(entropy([]), 1)
-    assert entropy([1, 1, 1, 1]) == 0
+    assert_almost_equal(_entropy([0, 0, 42.0]), 0.6365141, 5)
+    assert_almost_equal(_entropy([]), 1)
+    assert _entropy([1, 1, 1, 1]) == 0
 
 
 @pytest.mark.parametrize(
@@ -284,9 +294,9 @@ def test_entropy_array_api(array_namespace, device, dtype_name):
     empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device)
     int_labels = xp.asarray([1, 1, 1, 1], device=device)
     with config_context(array_api_dispatch=True):
-        assert entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5)
-        assert entropy(empty_int32_labels) == 1
-        assert entropy(int_labels) == 0
+        assert _entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5)
+        assert _entropy(empty_int32_labels) == 1
+        assert _entropy(int_labels) == 0
 
 
 def test_contingency_matrix():
@@ -339,7 +349,7 @@ def test_v_measure_and_mutual_information(seed=36):
         v_measure_score(labels_a, labels_b),
        2.0
         * mutual_info_score(labels_a, labels_b)
-        / (entropy(labels_a) + entropy(labels_b)),
+        / (_entropy(labels_a) + _entropy(labels_b)),
         0,
     )
     avg = "arithmetic"
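
test_public_entropy_deprecation matches on "Function entropy is deprecated" rather than the decorator argument because, as far as I can tell, `sklearn.utils.deprecated` prefixes "Function <name> is deprecated" to the extra message and emits a FutureWarning. A small sketch of that pattern (`old_fn` is a made-up example, not part of scikit-learn):

    import pytest
    from sklearn.utils import deprecated

    @deprecated("use something_else instead.")
    def old_fn():
        return 42

    # The warning message starts with the function name, so tests match on that.
    with pytest.warns(FutureWarning, match="Function old_fn is deprecated"):
        assert old_fn() == 42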

sklearn/tests/test_public_functions.py

Lines changed: 0 additions & 1 deletion

@@ -223,7 +223,6 @@ def _check_function_param_validation(
     "sklearn.metrics.classification_report",
     "sklearn.metrics.cluster.adjusted_mutual_info_score",
     "sklearn.metrics.cluster.contingency_matrix",
-    "sklearn.metrics.cluster.entropy",
     "sklearn.metrics.cluster.fowlkes_mallows_score",
     "sklearn.metrics.cluster.homogeneity_completeness_v_measure",
     "sklearn.metrics.cluster.normalized_mutual_info_score",
