Closed
Description
I think that we should expose `pos_label` as one of the parameters of `plot_precision_recall_curve`. I even think that we should issue a warning when the classes are imbalanced and the class considered positive is the one with the most samples. In most such cases, you are reporting the wrong part of your results, yet that may be exactly what the defaults give you.
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_precision_recall_curve
# Fetch the blood-transfusion dataset from OpenML, asking for pandas
# containers and for features and target as two separate objects.
X, y = fetch_openml(
    name="blood-transfusion-service-center", as_frame=True, return_X_y=True
)

# Give the feature columns human-readable names.
X.columns = ["Recency", "Frequency", "Monetary", "Time"]


def _readable_label(raw):
    """Map the raw target "2" to "donated" and anything else to "not donated"."""
    if raw == "2":
        return "donated"
    return "not donated"


# Relabel the target with readable class names, stored as a categorical.
y = y.apply(_readable_label).astype("category")

# Hold out half of the samples for evaluation; the fixed seed keeps the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, shuffle=True, random_state=0
)

# Fit a logistic regression with default settings on the training half.
classifier = LogisticRegression().fit(X_train, y_train)

# Plot the precision-recall curve on the held-out half. No positive class is
# specified here, which is the behavior this report is about.
plot_precision_recall_curve(classifier, X_test, y_test)