diff --git a/dask_ml/_partial.py b/dask_ml/_partial.py
index 659b00d64..4a196fce6 100644
--- a/dask_ml/_partial.py
+++ b/dask_ml/_partial.py
@@ -9,6 +9,7 @@
 import numpy as np
 import six
 from toolz import partial
+import sklearn.utils
 
 import dask
 from dask.delayed import Delayed
@@ -113,14 +114,15 @@ def _partial_fit(model, x, y, kwargs=None):
     return model
 
 
-def fit(model, x, y, compute=True, **kwargs):
+def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None,
+        **kwargs):
     """ Fit scikit learn model against dask arrays
 
     Model must support the ``partial_fit`` interface for online or batch
     learning.
 
-    This method will be called on dask arrays in sequential order.  Ideally
-    your rows are independent and identically distributed.
+    Ideally your rows are independent and identically distributed. By default,
+    this function will step through chunks of the arrays in random order.
 
     Parameters
     ----------
@@ -130,6 +132,12 @@ def fit(model, x, y, compute=True, **kwargs):
         Two dimensional array, likely tall and skinny
     y: dask Array
         One dimensional array with same chunks as x's rows
+    compute : bool
+        Whether to eagerly compute the result (default) or return a ``Delayed``
+    shuffle_blocks : bool
+        Whether to shuffle the order of the blocks using ``random_state``
+    random_state : int or numpy.random.RandomState
+        Random state to use when shuffling blocks
     kwargs:
         options to pass to partial_fit
 
@@ -171,12 +179,17 @@ def fit(model, x, y, compute=True, **kwargs):
         x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))
 
     nblocks = len(x.chunks[0])
+    order = list(range(nblocks))
+    if shuffle_blocks:
+        rng = sklearn.utils.check_random_state(random_state)
+        rng.shuffle(order)
 
-    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs)
+    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs, order)
     dsk = {(name, -1): model}
     dsk.update({(name, i): (_partial_fit, (name, i - 1),
-                            (x.name, i, 0),
-                            (getattr(y, 'name', ''), i), kwargs)
+                            (x.name, order[i], 0),
+                            (getattr(y, 'name', ''), order[i]),
+                            kwargs)
                 for i in range(nblocks)})
 
     new_dsk = dask.sharedict.merge((name, dsk), x.dask, getattr(y, 'dask', {}))
diff --git a/dask_ml/wrappers.py b/dask_ml/wrappers.py
index 1b635bf35..2f88bee4b 100644
--- a/dask_ml/wrappers.py
+++ b/dask_ml/wrappers.py
@@ -323,6 +323,14 @@ class Incremental(ParallelPostFit):
         a single NumPy array, which may exhaust the memory of your worker.
         You probably want to always specify `scoring`.
 
+    random_state : int or numpy.random.RandomState, optional
+        Seed or random state that determines how the blocks are shuffled.
+
+    shuffle_blocks : bool, default True
+        Determines whether to call ``partial_fit`` on a randomly selected chunk
+        of the Dask arrays (default), or to fit in sequential order. Only the
+        block order is shuffled; rows within each block keep their order.
+
     Attributes
     ----------
     estimator_ : Estimator
@@ -350,6 +358,11 @@
     >>> gs = GridSearchCV(clf, param_grid)
     >>> gs.fit(X, y, classes=[0, 1])
     """
+    def __init__(self, estimator=None, scoring=None, shuffle_blocks=True,
+                 random_state=None):
+        self.shuffle_blocks = shuffle_blocks
+        self.random_state = random_state
+        super(Incremental, self).__init__(estimator=estimator, scoring=scoring)
 
     @property
     def _postfit_estimator(self):
@@ -361,7 +374,10 @@ def _fit_for_estimator(self, estimator, X, y, **fit_kwargs):
         if not dask.is_dask_collection(X) and not dask.is_dask_collection(y):
             result = estimator.partial_fit(X=X, y=y, **fit_kwargs)
         else:
-            result = fit(estimator, X, y, **fit_kwargs)
+            result = fit(estimator, X, y,
+                         random_state=self.random_state,
+                         shuffle_blocks=self.shuffle_blocks,
+                         **fit_kwargs)
 
         copy_learned_attributes(result, self)
         self.estimator_ = result
diff --git a/tests/test_incremental.py b/tests/test_incremental.py
index cd3ed2e17..a04c3aed0 100644
--- a/tests/test_incremental.py
+++ b/tests/test_incremental.py
@@ -8,7 +8,6 @@
 from sklearn.linear_model import SGDClassifier
 
 from dask_ml.wrappers import Incremental
-from dask_ml.utils import assert_estimator_equal
 import dask_ml.metrics
 from dask_ml.metrics.scorer import check_scoring
 
@@ -31,14 +30,20 @@ def test_set_params():
     assert result['scoring'] == 'accuracy'
 
 
-def test_incremental_basic(scheduler, xy_classification):
-    X, y = xy_classification
+def test_incremental_basic(scheduler):
+    # Create observations that we know linear models can recover
+    n, d = 100, 3
+    rng = da.random.RandomState(42)
+    X = rng.normal(size=(n, d), chunks=30)
+    coef_star = rng.uniform(size=d, chunks=d)
+    y = da.sign(X.dot(coef_star))
+    y = (y + 1) / 2
     with scheduler() as (s, [_, _]):
-        est1 = SGDClassifier(random_state=0, tol=1e-3)
+        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
         est2 = clone(est1)
 
-        clf = Incremental(est1)
+        clf = Incremental(est1, random_state=0)
         result = clf.fit(X, y, classes=[0, 1])
         for slice_ in da.core.slices_from_chunks(X.chunks):
             est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])
 
@@ -46,28 +51,29 @@
         assert result is clf
 
         assert isinstance(result.estimator_.coef_, np.ndarray)
-        np.testing.assert_array_almost_equal(result.estimator_.coef_,
-                                             est2.coef_)
+        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
+        rel_error /= np.linalg.norm(clf.coef_)
+        assert rel_error < 0.9
 
-        assert_estimator_equal(clf.estimator_, est2,
-                               exclude=['loss_function_'])
+        assert set(dir(clf.estimator_)) == set(dir(est2))
 
         # Predict
         result = clf.predict(X)
         expected = est2.predict(X)
         assert isinstance(result, da.Array)
-        assert_eq(result, expected)
+        rel_error = np.linalg.norm(result - expected)
+        rel_error /= np.linalg.norm(expected)
+        assert rel_error < 0.2
 
         # score
         result = clf.score(X, y)
         expected = est2.score(X, y)
-        # assert isinstance(result, da.Array)
-        assert_eq(result, expected)
+        assert abs(result - expected) < 0.1
 
-        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
+        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
+                                        average=True))
         clf.partial_fit(X, y, classes=[0, 1])
-        assert_estimator_equal(clf.estimator_, est2,
-                               exclude=['loss_function_'])
+        assert set(dir(clf.estimator_)) == set(dir(est2))
 
 
 def test_in_gridsearch(scheduler, xy_classification):
diff --git a/tests/test_partial.py b/tests/test_partial.py
index 9d4151a43..c5eb88444 100644
--- a/tests/test_partial.py
+++ b/tests/test_partial.py
@@ -1,10 +1,12 @@
 from sklearn.linear_model import SGDClassifier
+from sklearn.base import clone
 import numpy as np
 
 import dask
 import dask.array as da
 from dask_ml._partial import fit, predict
 from dask_ml.datasets import make_classification
 from dask_ml.wrappers import Incremental
+import pytest
 
 x = np.array([[1, 0],
@@ -49,3 +51,29 @@ def test_fit_rechunking():
 
     clf = Incremental(SGDClassifier(max_iter=5))
     clf.fit(X, y, classes=list(range(n_classes)))
+
+
+def test_fit_shuffle_blocks():
+    N = 10
+    X = da.from_array(1 + np.arange(N).reshape(-1, 1), chunks=1)
+    y = da.from_array(np.ones(N), chunks=1)
+    classes = [0, 1]
+
+    sgd = SGDClassifier(max_iter=5, random_state=0, fit_intercept=False,
+                        shuffle=False)
+
+    sgd1 = fit(clone(sgd), X, y, random_state=0, classes=classes)
+    sgd2 = fit(clone(sgd), X, y, random_state=42, classes=classes)
+    assert len(sgd1.coef_) == len(sgd2.coef_) == 1
+    assert not np.allclose(sgd1.coef_, sgd2.coef_)
+
+    X, y = make_classification(random_state=0, chunks=20)
+    sgd_a = fit(clone(sgd), X, y, random_state=0, classes=classes,
+                shuffle_blocks=False)
+    sgd_b = fit(clone(sgd), X, y, random_state=42, classes=classes,
+                shuffle_blocks=False)
+    assert np.allclose(sgd_a.coef_, sgd_b.coef_)
+
+    with pytest.raises(ValueError, match='cannot be used to seed'):
+        fit(sgd, X, y, classes=np.array([-1, 0, 1]),
+            shuffle_blocks=True, random_state=da.random.RandomState(42))
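
Not part of the patch: a minimal usage sketch of the behaviour added above, assuming the patched dask-ml together with scikit-learn's SGDClassifier; the synthetic data, chunk sizes, and hyperparameters are illustrative only.

# Usage sketch (illustrative, not part of the diff): exercising the new
# shuffle_blocks / random_state arguments on Incremental.
import dask.array as da
import numpy as np
from sklearn.linear_model import SGDClassifier

from dask_ml.wrappers import Incremental

X = da.random.random((1000, 10), chunks=100)                  # 10 row-blocks
y = da.from_array(np.random.randint(0, 2, 1000), chunks=100)  # matching chunks

# Default: blocks are visited in a shuffled but reproducible order.
clf = Incremental(SGDClassifier(max_iter=5, tol=1e-3),
                  shuffle_blocks=True, random_state=0)
clf.fit(X, y, classes=[0, 1])

# shuffle_blocks=False restores the previous sequential behaviour.
clf_seq = Incremental(SGDClassifier(max_iter=5, tol=1e-3), shuffle_blocks=False)
clf_seq.fit(X, y, classes=[0, 1])

Note that the block order is folded into the task name (``tokenize(model, x, y, kwargs, order)``), so two fits that visit the blocks in different orders produce distinct graph keys rather than colliding on a cached result.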