25 changes: 19 additions & 6 deletions dask_ml/_partial.py
@@ -9,6 +9,7 @@
import numpy as np
import six
from toolz import partial
+import sklearn.utils

import dask
from dask.delayed import Delayed
@@ -113,14 +114,15 @@ def _partial_fit(model, x, y, kwargs=None):
    return model


-def fit(model, x, y, compute=True, **kwargs):
+def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None,
+        **kwargs):
""" Fit scikit learn model against dask arrays

Model must support the ``partial_fit`` interface for online or batch
learning.

This method will be called on dask arrays in sequential order. Ideally
your rows are independent and identically distributed.
Ideally your rows are independent and identically distributed. By default,
this function will step through chunks of the arrays in random order.

Parameters
----------
@@ -130,6 +132,12 @@ def fit(model, x, y, compute=True, **kwargs):
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
+    compute : bool
+        Whether to compute this result
+    shuffle_blocks : bool
+        Whether to shuffle the blocks with ``random_state`` or not
+    random_state : int or numpy.random.RandomState
+        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

@@ -171,12 +179,17 @@ def fit(model, x, y, compute=True, **kwargs):
    x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
+    order = list(range(nblocks))
+    if shuffle_blocks:
+        rng = sklearn.utils.check_random_state(random_state)
+        rng.shuffle(order)

-    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs)
+    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs, order)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (_partial_fit, (name, i - 1),
-                            (x.name, i, 0),
-                            (getattr(y, 'name', ''), i), kwargs)
+                            (x.name, order[i], 0),
+                            (getattr(y, 'name', ''), order[i]),
+                            kwargs)
                for i in range(nblocks)})

    new_dsk = dask.sharedict.merge((name, dsk), x.dask, getattr(y, 'dask', {}))
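
Note on the construction above: the graph chains one `_partial_fit` task per block, with task `(name, i)` depending on the model produced by `(name, i - 1)`, so training stays strictly sequential; shuffling only changes the order in which blocks are visited. A minimal sketch of the same control flow in plain Python, with a hypothetical `blocks` list of `(X_block, y_block)` pairs standing in for the dask chunks:

import sklearn.utils

def fit_blocks(model, blocks, shuffle_blocks=True, random_state=None):
    # Visit blocks in a (possibly shuffled) order, threading the model
    # through successive partial_fit calls, as the task graph above does.
    order = list(range(len(blocks)))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)
    for i in order:
        X_block, y_block = blocks[i]
        model = model.partial_fit(X_block, y_block)
    return model
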
18 changes: 17 additions & 1 deletion dask_ml/wrappers.py
@@ -323,6 +323,14 @@ class Incremental(ParallelPostFit):
        a single NumPy array, which may exhaust the memory of your worker.
        You probably want to always specify `scoring`.

+    random_state : int or numpy.random.RandomState, optional
+        Random object that determines how to shuffle blocks.
+
+    shuffle_blocks : bool, default True
+        Determines whether to call ``partial_fit`` on a randomly selected
+        chunk of the Dask arrays (default), or to fit in sequential order.
+        This does not shuffle rows between blocks or within each block.
+
    Attributes
    ----------
    estimator_ : Estimator
@@ -350,6 +358,11 @@ class Incremental(ParallelPostFit):
    >>> gs = GridSearchCV(clf, param_grid)
    >>> gs.fit(X, y, classes=[0, 1])
    """
+    def __init__(self, estimator=None, scoring=None, shuffle_blocks=True,
+                 random_state=None):
+        self.shuffle_blocks = shuffle_blocks
+        self.random_state = random_state
+        super(Incremental, self).__init__(estimator=estimator, scoring=scoring)

    @property
    def _postfit_estimator(self):
@@ -361,7 +374,10 @@ def _fit_for_estimator(self, estimator, X, y, **fit_kwargs):
        if not dask.is_dask_collection(X) and not dask.is_dask_collection(y):
            result = estimator.partial_fit(X=X, y=y, **fit_kwargs)
        else:
-            result = fit(estimator, X, y, **fit_kwargs)
+            result = fit(estimator, X, y,
+                         random_state=self.random_state,
+                         shuffle_blocks=self.shuffle_blocks,
+                         **fit_kwargs)

        copy_learned_attributes(result, self)
        self.estimator_ = result
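
For reference, the two new keywords are ordinary constructor arguments, so a seeded, reproducible incremental fit looks roughly like this (a sketch with made-up data; a fixed `random_state` pins down the shuffled block order across runs):

import dask.array as da
from sklearn.linear_model import SGDClassifier
from dask_ml.wrappers import Incremental

X = da.random.random((1000, 4), chunks=100)
y = (X.sum(axis=1) > 2).astype(int)

# shuffle_blocks=True is the default; random_state=0 makes the
# shuffled visiting order of the ten chunks deterministic.
clf = Incremental(SGDClassifier(tol=1e-3), shuffle_blocks=True,
                  random_state=0)
clf.fit(X, y, classes=[0, 1])
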
36 changes: 21 additions & 15 deletions tests/test_incremental.py
@@ -8,7 +8,6 @@
from sklearn.linear_model import SGDClassifier

from dask_ml.wrappers import Incremental
-from dask_ml.utils import assert_estimator_equal
import dask_ml.metrics
from dask_ml.metrics.scorer import check_scoring

@@ -31,43 +30,50 @@ def test_set_params():
    assert result['scoring'] == 'accuracy'


-def test_incremental_basic(scheduler, xy_classification):
-    X, y = xy_classification
+def test_incremental_basic(scheduler):
+    # Create observations that we know linear models can recover
+    n, d = 100, 3
+    rng = da.random.RandomState(42)
+    X = rng.normal(size=(n, d), chunks=30)
+    coef_star = rng.uniform(size=d, chunks=d)
+    y = da.sign(X.dot(coef_star))
+    y = (y + 1) / 2

    with scheduler() as (s, [_, _]):
-        est1 = SGDClassifier(random_state=0, tol=1e-3)
+        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

-        clf = Incremental(est1)
+        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        assert result is clf

        assert isinstance(result.estimator_.coef_, np.ndarray)
-        np.testing.assert_array_almost_equal(result.estimator_.coef_,
-                                             est2.coef_)
+        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
+        rel_error /= np.linalg.norm(clf.coef_)
+        assert rel_error < 0.9

-        assert_estimator_equal(clf.estimator_, est2,
-                               exclude=['loss_function_'])
+        assert set(dir(clf.estimator_)) == set(dir(est2))

        # Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
-        assert_eq(result, expected)
+        rel_error = np.linalg.norm(result - expected)
+        rel_error /= np.linalg.norm(expected)
+        assert rel_error < 0.2

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
-        # assert isinstance(result, da.Array)
-        assert_eq(result, expected)
+        assert abs(result - expected) < 0.1

-        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
+        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
+                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
-        assert_estimator_equal(clf.estimator_, est2,
-                               exclude=['loss_function_'])
+        assert set(dir(clf.estimator_)) == set(dir(est2))


def test_in_gridsearch(scheduler, xy_classification):
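
Since `Incremental` now visits blocks in shuffled order while the `est2` loop fits them sequentially, the two coefficient vectors can no longer match exactly, so the test bounds their relative error instead of asserting equality. The quantity being asserted on is simply:

import numpy as np

def relative_error(a, b):
    # ||a - b|| / ||a||: a scale-free distance between coefficient vectors
    return np.linalg.norm(a - b) / np.linalg.norm(a)
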
28 changes: 28 additions & 0 deletions tests/test_partial.py
@@ -1,10 +1,12 @@
from sklearn.linear_model import SGDClassifier
+from sklearn.base import clone
import numpy as np
import dask
import dask.array as da
from dask_ml._partial import fit, predict
from dask_ml.datasets import make_classification
from dask_ml.wrappers import Incremental
+import pytest


x = np.array([[1, 0],
@@ -49,3 +51,29 @@ def test_fit_rechunking():

    clf = Incremental(SGDClassifier(max_iter=5))
    clf.fit(X, y, classes=list(range(n_classes)))
+
+
+def test_fit_shuffle_blocks():
+    N = 10
+    X = da.from_array(1 + np.arange(N).reshape(-1, 1), chunks=1)
+    y = da.from_array(np.ones(N), chunks=1)
+    classes = [0, 1]
+
+    sgd = SGDClassifier(max_iter=5, random_state=0, fit_intercept=False,
+                        shuffle=False)
+
+    sgd1 = fit(clone(sgd), X, y, random_state=0, classes=classes)
+    sgd2 = fit(clone(sgd), X, y, random_state=42, classes=classes)
+    assert len(sgd1.coef_) == len(sgd2.coef_) == 1
+    assert not np.allclose(sgd1.coef_, sgd2.coef_)
+
+    X, y = make_classification(random_state=0, chunks=20)
+    sgd_a = fit(clone(sgd), X, y, random_state=0, classes=classes,
+                shuffle_blocks=False)
+    sgd_b = fit(clone(sgd), X, y, random_state=42, classes=classes,
+                shuffle_blocks=False)
+    assert np.allclose(sgd_a.coef_, sgd_b.coef_)
+
+    with pytest.raises(ValueError, match='cannot be used to seed'):
+        fit(sgd, X, y, classes=np.array([-1, 0, 1]),
+            shuffle_blocks=True, random_state=da.random.RandomState(42))
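
The final assertion relies on scikit-learn's seed validation: `sklearn.utils.check_random_state` accepts only `None`, an integer, or a `numpy.random.RandomState`, and rejects anything else (including a dask `RandomState`) with a ValueError saying the value 'cannot be used to seed a numpy.random.RandomState instance'. A quick standalone check of that behavior:

import numpy as np
import sklearn.utils

# Accepted seeds come back as (or wrapped in) a numpy RandomState.
assert isinstance(sklearn.utils.check_random_state(None), np.random.RandomState)
assert isinstance(sklearn.utils.check_random_state(7), np.random.RandomState)
rng = np.random.RandomState(7)
assert sklearn.utils.check_random_state(rng) is rng

# Anything else is rejected, which is what the test exercises.
try:
    sklearn.utils.check_random_state(object())
except ValueError as err:
    assert 'cannot be used to seed' in str(err)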