25 changes: 19 additions & 6 deletions dask_ml/_partial.py
@@ -9,6 +9,7 @@
import numpy as np
import six
from toolz import partial
+import sklearn.utils

import dask
from dask.delayed import Delayed
@@ -113,14 +114,15 @@ def _partial_fit(model, x, y, kwargs=None):
    return model


-def fit(model, x, y, compute=True, **kwargs):
+def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None,
+        **kwargs):
""" Fit scikit learn model against dask arrays

Model must support the ``partial_fit`` interface for online or batch
learning.

This method will be called on dask arrays in sequential order. Ideally
your rows are independent and identically distributed.
Ideally your rows are independent and identically distributed. By default,
this function will step through chunks of the arrays in random order.

Parameters
----------
@@ -130,6 +132,12 @@ def fit(model, x, y, compute=True, **kwargs):
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
+    compute : bool
+        Whether to compute this result
+    shuffle_blocks : bool
+        Whether to shuffle the blocks with ``random_state`` or not
+    random_state : int or numpy.random.RandomState
+        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

@@ -171,12 +179,17 @@ def fit(model, x, y, compute=True, **kwargs):
    x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
+    order = list(range(nblocks))
+    if shuffle_blocks:
+        rng = sklearn.utils.check_random_state(random_state)
+        rng.shuffle(order)

-    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs)
+    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs, order)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (_partial_fit, (name, i - 1),
-                            (x.name, i, 0),
-                            (getattr(y, 'name', ''), i), kwargs)
+                            (x.name, order[i], 0),
+                            (getattr(y, 'name', ''), order[i]),
+                            kwargs)
                for i in range(nblocks)})

    new_dsk = dask.sharedict.merge((name, dsk), x.dask, getattr(y, 'dask', {}))
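
Note on the construction above: the graph chains one `_partial_fit` task per block, with task `(name, i)` depending on the model produced by `(name, i - 1)`, so training stays strictly sequential; shuffling only changes the order in which blocks are visited. A minimal sketch of the same control flow in plain Python, with a hypothetical `blocks` list of `(X_block, y_block)` pairs standing in for the dask chunks:

import sklearn.utils

def fit_blocks(model, blocks, shuffle_blocks=True, random_state=None):
    # Visit blocks in a (possibly shuffled) order, threading the model
    # through successive partial_fit calls, as the task graph above does.
    order = list(range(len(blocks)))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)
    for i in order:
        X_block, y_block = blocks[i]
        model = model.partial_fit(X_block, y_block)
    return model
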
18 changes: 17 additions & 1 deletion dask_ml/wrappers.py
@@ -323,6 +323,14 @@ class Incremental(ParallelPostFit):
        a single NumPy array, which may exhaust the memory of your worker.
        You probably want to always specify `scoring`.

+    random_state : int or numpy.random.RandomState, optional
+        Random object that determines how to shuffle blocks.
+
+    shuffle_blocks : bool, default True
+        Determines whether to call ``partial_fit`` on a randomly selected
+        chunk of the Dask arrays (default), or to fit in sequential order.
+        This does not shuffle rows between blocks or within each block.
+
    Attributes
    ----------
    estimator_ : Estimator
@@ -350,6 +358,11 @@ class Incremental(ParallelPostFit):
    >>> gs = GridSearchCV(clf, param_grid)
    >>> gs.fit(X, y, classes=[0, 1])
    """
+    def __init__(self, estimator=None, scoring=None, shuffle_blocks=True,
+                 random_state=None):
+        self.shuffle_blocks = shuffle_blocks
+        self.random_state = random_state
+        super(Incremental, self).__init__(estimator=estimator, scoring=scoring)

    @property
    def _postfit_estimator(self):
@@ -361,7 +374,10 @@ def _fit_for_estimator(self, estimator, X, y, **fit_kwargs):
        if not dask.is_dask_collection(X) and not dask.is_dask_collection(y):
            result = estimator.partial_fit(X=X, y=y, **fit_kwargs)
        else:
-            result = fit(estimator, X, y, **fit_kwargs)
+            result = fit(estimator, X, y,
+                         random_state=self.random_state,
+                         shuffle_blocks=self.shuffle_blocks,
+                         **fit_kwargs)

        copy_learned_attributes(result, self)
        self.estimator_ = result
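
For reference, the two new keywords are ordinary constructor arguments, so a seeded, reproducible incremental fit looks roughly like this (a sketch with made-up data; a fixed `random_state` pins down the shuffled block order across runs):

import dask.array as da
from sklearn.linear_model import SGDClassifier
from dask_ml.wrappers import Incremental

X = da.random.random((1000, 4), chunks=100)
y = (X.sum(axis=1) > 2).astype(int)

# shuffle_blocks=True is the default; random_state=0 makes the
# shuffled visiting order of the ten chunks deterministic.
clf = Incremental(SGDClassifier(tol=1e-3), shuffle_blocks=True,
                  random_state=0)
clf.fit(X, y, classes=[0, 1])
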
36 changes: 21 additions & 15 deletions tests/test_incremental.py
@@ -8,7 +8,6 @@
from sklearn.linear_model import SGDClassifier

from dask_ml.wrappers import Incremental
-from dask_ml.utils import assert_estimator_equal
import dask_ml.metrics
from dask_ml.metrics.scorer import check_scoring

@@ -31,43 +30,50 @@ def test_set_params():
    assert result['scoring'] == 'accuracy'


-def test_incremental_basic(scheduler, xy_classification):
-    X, y = xy_classification
+def test_incremental_basic(scheduler):
+    # Create observations that we know linear models can recover
+    n, d = 100, 3
+    rng = da.random.RandomState(42)
+    X = rng.normal(size=(n, d), chunks=30)
+    coef_star = rng.uniform(size=d, chunks=d)
+    y = da.sign(X.dot(coef_star))
+    y = (y + 1) / 2

    with scheduler() as (s, [_, _]):
-        est1 = SGDClassifier(random_state=0, tol=1e-3)
+        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

-        clf = Incremental(est1)
+        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        assert result is clf

        assert isinstance(result.estimator_.coef_, np.ndarray)
-        np.testing.assert_array_almost_equal(result.estimator_.coef_,
-                                             est2.coef_)
+        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
+        rel_error /= np.linalg.norm(clf.coef_)
+        assert rel_error < 0.9

-        assert_estimator_equal(clf.estimator_, est2,
-                               exclude=['loss_function_'])
+        assert set(dir(clf.estimator_)) == set(dir(est2))

        # Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
-        assert_eq(result, expected)
+        rel_error = np.linalg.norm(result - expected)
+        rel_error /= np.linalg.norm(expected)
+        assert rel_error < 0.2

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
-        # assert isinstance(result, da.Array)
-        assert_eq(result, expected)
+        assert abs(result - expected) < 0.1

-        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
+        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
+                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
-        assert_estimator_equal(clf.estimator_, est2,
-                               exclude=['loss_function_'])
+        assert set(dir(clf.estimator_)) == set(dir(est2))


def test_in_gridsearch(scheduler, xy_classification):
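
Since `Incremental` now visits blocks in shuffled order while the `est2` loop fits them sequentially, the two coefficient vectors can no longer match exactly, so the test bounds their relative error instead of asserting equality. The quantity being asserted on is simply:

import numpy as np

def relative_error(a, b):
    # ||a - b|| / ||a||: a scale-free distance between coefficient vectors
    return np.linalg.norm(a - b) / np.linalg.norm(a)
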
28 changes: 28 additions & 0 deletions tests/test_partial.py
@@ -1,10 +1,12 @@
from sklearn.linear_model import SGDClassifier
+from sklearn.base import clone
import numpy as np
import dask
import dask.array as da
from dask_ml._partial import fit, predict
from dask_ml.datasets import make_classification
from dask_ml.wrappers import Incremental
+import pytest


x = np.array([[1, 0],
@@ -49,3 +51,29 @@ def test_fit_rechunking():

    clf = Incremental(SGDClassifier(max_iter=5))
    clf.fit(X, y, classes=list(range(n_classes)))
+
+
+def test_fit_shuffle_blocks():
+    N = 10
+    X = da.from_array(1 + np.arange(N).reshape(-1, 1), chunks=1)
+    y = da.from_array(np.ones(N), chunks=1)
+    classes = [0, 1]
+
+    sgd = SGDClassifier(max_iter=5, random_state=0, fit_intercept=False,
+                        shuffle=False)
+
+    sgd1 = fit(clone(sgd), X, y, random_state=0, classes=classes)
+    sgd2 = fit(clone(sgd), X, y, random_state=42, classes=classes)
+    assert len(sgd1.coef_) == len(sgd2.coef_) == 1
+    assert not np.allclose(sgd1.coef_, sgd2.coef_)
+
+    X, y = make_classification(random_state=0, chunks=20)
+    sgd_a = fit(clone(sgd), X, y, random_state=0, classes=classes,
+                shuffle_blocks=False)
+    sgd_b = fit(clone(sgd), X, y, random_state=42, classes=classes,
+                shuffle_blocks=False)
+    assert np.allclose(sgd_a.coef_, sgd_b.coef_)
+
+    with pytest.raises(ValueError, match='cannot be used to seed'):
+        fit(sgd, X, y, classes=np.array([-1, 0, 1]),
+            shuffle_blocks=True, random_state=da.random.RandomState(42))
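
The final assertion relies on scikit-learn's seed validation: `sklearn.utils.check_random_state` accepts only `None`, an integer, or a `numpy.random.RandomState`, and rejects anything else (including a dask `RandomState`) with a ValueError saying the value 'cannot be used to seed a numpy.random.RandomState instance'. A quick standalone check of that behavior:

import numpy as np
import sklearn.utils

# Accepted seeds come back as (or wrapped in) a numpy RandomState.
assert isinstance(sklearn.utils.check_random_state(None), np.random.RandomState)
assert isinstance(sklearn.utils.check_random_state(7), np.random.RandomState)
rng = np.random.RandomState(7)
assert sklearn.utils.check_random_state(rng) is rng

# Anything else is rejected, which is what the test exercises.
try:
    sklearn.utils.check_random_state(object())
except ValueError as err:
    assert 'cannot be used to seed' in str(err)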