Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sklearn/cluster/k_means_.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,7 @@ def predict(self, X):
x_squared_norms = _squared_norms(X)
return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]

def score(self, X):
def score(self, X, y=None):
"""Opposite of the value of X on the K-means objective.

Parameters
Expand Down
24 changes: 5 additions & 19 deletions sklearn/grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,16 +118,11 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func,
else:
this_score = clf.score(X_test, y_test)

if y is not None:
if hasattr(y, 'shape'):
this_n_test_samples = y.shape[0]
else:
this_n_test_samples = len(y)
if hasattr(X, 'shape'):
this_n_test_samples = X.shape[0]
else:
if hasattr(X, 'shape'):
this_n_test_samples = X.shape[0]
else:
this_n_test_samples = len(X)
this_n_test_samples = len(X)

if verbose > 2:
msg += ", score=%f" % this_score
if verbose > 1:
Expand Down Expand Up @@ -424,8 +419,6 @@ def _fit(self, X, y):
best_score = score
best_params = params

if best_score is None:
raise ValueError('Best score could not be found')
self.best_score_ = best_score
self.best_params_ = best_params

Expand All @@ -447,11 +440,4 @@ def _fit(self, X, y):
return self

def score(self, X, y=None):
if hasattr(self.best_estimator_, 'score'):
return self.best_estimator_.score(X, y)
if self.score_func is None:
raise ValueError("No score function explicitly defined, "
"and the estimator doesn't provide one %s"
% self.best_estimator_)
y_predicted = self.predict(X)
return self.score_func(y, y_predicted)
return self.best_estimator_.score(X, y)
17 changes: 16 additions & 1 deletion sklearn/tests/test_grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@

from sklearn.base import BaseEstimator
from sklearn.grid_search import GridSearchCV
from sklearn.datasets.samples_generator import make_classification
from sklearn.datasets import make_classification, make_blobs
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import f1_score, precision_score
from sklearn.cross_validation import KFold
from sklearn.cluster import KMeans


class MockClassifier(BaseEstimator):
Expand Down Expand Up @@ -225,3 +226,17 @@ def test_X_as_list():
cv = KFold(n=len(X), n_folds=3)
grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
grid_search.fit(X.tolist(), y).score(X, y)


def test_unsupervised():
"""Use GridSearch for k in kmeans.

The scoring function is not really helpful,
so this is a smoke test for long-forgotten API.
"""
X, y = make_blobs()
param_grid = dict(n_clusters=np.arange(1, 5))

# be verbose for that extra test coverage
grid_search = GridSearchCV(KMeans(), param_grid, verbose=2)
grid_search.fit(X)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it makes sense to use the inertia (the default score method for KMeans) as a way to select the number of clusters. Inertia is to be minimized for an a priori fixed number of clusters. If the number of clusters increases, inertia will always decrease; hence the best model will always be n_clusters=5 in this case, whatever the data.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know. See the discussion with @larsmans that GitHub doesn't show here.

This is just a smoke test.
I could remove the case of unsupervised grid-search altogether.
I don't see a better way of testing it.