Open
Description
Describe the workflow you want to enable
Scikit-learn defines three popular metrics for evaluating clustering performance when there are no ground-truth cluster labels: sklearn.metrics.silhouette_score, sklearn.metrics.calinski_harabasz_score and sklearn.metrics.davies_bouldin_score. However, many other such internal validation metrics exist, and whether to integrate more of them into scikit-learn has been discussed previously.
Describe your proposed solution
I've implemented four relatively popular ones, using the same interface and code style as in https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/metrics/cluster/_unsupervised.py. Would there be interest in integrating these into scikit-learn?
import numpy as np
from itertools import combinations
from sklearn.utils import check_X_y
from sklearn.metrics.cluster._unsupervised import check_number_of_labels
from sklearn.preprocessing import LabelEncoder
def log_ss_ratio(X, labels):
    """Compute the log of the between- to within-cluster dispersion ratio.

    This is the natural log of the same SS_between / SS_within ratio that
    underlies the Calinski-Harabasz score (without its degrees-of-freedom
    normalization).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix of the clustered samples.
    labels : array-like of shape (n_samples,)
        Predicted cluster label for each sample.

    Returns
    -------
    score : float
        ``log(SS_between / SS_within)``.
    """
    X, labels = check_X_y(X, labels)
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    n_samples = X.shape[0]
    n_labels = len(encoder.classes_)
    check_number_of_labels(n_labels, n_samples)
    overall_mean = X.mean(axis=0)
    between = 0.0
    within = 0.0
    for label in range(n_labels):
        members = X[labels == label]
        centroid = members.mean(axis=0)
        # Between-cluster dispersion is weighted by cluster size.
        between += len(members) * np.sum((centroid - overall_mean) ** 2)
        within += np.sum((members - centroid) ** 2)
    return np.log(between / within)
def ball_hall(X, labels):
    """Compute the Ball-Hall index.

    The index is the mean, over clusters, of the per-cluster mean squared
    distance of the cluster's points to their centroid.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix of the clustered samples.
    labels : array-like of shape (n_samples,)
        Predicted cluster label for each sample.

    Returns
    -------
    score : float
        Average of the per-cluster mean dispersions.
    """
    X, labels = check_X_y(X, labels)
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    n_samples = X.shape[0]
    n_labels = len(encoder.classes_)
    check_number_of_labels(n_labels, n_samples)
    total = 0
    for label in range(n_labels):
        members = X[labels == label]
        centroid = members.mean(axis=0)
        # Mean (not total) dispersion: normalize by the cluster size.
        total += np.sum((members - centroid) ** 2) / len(members)
    return total / n_labels
def banfeld_raftery(X, labels):
    """Compute the Banfeld-Raftery index.

    Sums, over clusters, the cluster size times the log of the cluster's
    mean squared dispersion around its centroid.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix of the clustered samples.
    labels : array-like of shape (n_samples,)
        Predicted cluster label for each sample.

    Returns
    -------
    score : float
        Sum of ``n_k * log(W_k / n_k)`` over clusters ``k``.

    Notes
    -----
    NOTE(review): a cluster containing a single point has zero dispersion,
    so its term is ``log(0)`` and the result is ``-inf`` (with a NumPy
    warning) — confirm whether singleton clusters should be handled.
    """
    X, labels = check_X_y(X, labels)
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    n_samples = X.shape[0]
    n_labels = len(encoder.classes_)
    check_number_of_labels(n_labels, n_samples)
    score = 0
    for label in range(n_labels):
        members = X[labels == label]
        size = len(members)
        dispersion = np.sum((members - members.mean(axis=0)) ** 2)
        score += size * np.log(dispersion / size)
    return score
def ray_turi(X, labels):
    """Compute the Ray-Turi index.

    The ratio of the mean within-cluster dispersion to the smallest
    squared distance between any two cluster centroids.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix of the clustered samples.
    labels : array-like of shape (n_samples,)
        Predicted cluster label for each sample.

    Returns
    -------
    score : float
        ``SS_within / (n_samples * min squared centroid distance)``.
    """
    X, labels = check_X_y(X, labels)
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    n_samples = X.shape[0]
    n_labels = len(encoder.classes_)
    check_number_of_labels(n_labels, n_samples)
    within = 0.0
    centroids = []
    for label in range(n_labels):
        members = X[labels == label]
        centroid = members.mean(axis=0)
        within += np.sum((members - centroid) ** 2)
        centroids.append(centroid)
    # Denominator uses the closest pair of centroids, so the index
    # penalizes solutions whose clusters are poorly separated.
    closest_pair = min(
        np.sum((a - b) ** 2) for a, b in combinations(centroids, 2)
    )
    return within / (closest_pair * n_samples)
Describe alternatives you've considered, if relevant
No response
Additional context
No response