@@ -38,7 +38,7 @@ class BaseMultilayerPerceptron(six.with_metaclass(ABCMeta, BaseEstimator)):
     def __init__(self, hidden_layer_sizes, activation, algorithm,
                  alpha, batch_size, learning_rate, learning_rate_init, power_t,
                  max_iter, loss, shuffle, random_state, tol, verbose,
-                 warm_start):
+                 warm_start, momentum):
         self.activation = activation
         self.algorithm = algorithm
         self.alpha = alpha
@@ -54,6 +54,7 @@ def __init__(self, hidden_layer_sizes, activation, algorithm,
         self.tol = tol
         self.verbose = verbose
         self.warm_start = warm_start
+        self.momentum = momentum

         self.layers_coef_ = None
         self.layers_intercept_ = None
@@ -373,9 +374,65 @@ def _fit(self, X, y, incremental=False):

         # Run the Stochastic Gradient Descent algorithm
         if self.algorithm == 'sgd':
-            prev_cost = np.inf
-            cost_increase_count = 0
+            self._fit_sgd(X, y, activations, deltas, coef_grads,
+                          intercept_grads, layer_units, incremental)

+        # Run the LBFGS algorithm
+        elif self.algorithm == 'l-bfgs':
+            self._fit_lbfgs(X, y, activations, deltas, coef_grads,
+                            intercept_grads, layer_units)
+        return self
+
+    def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads,
+                   layer_units):
+        # Store meta information for the parameters
+        self._coef_indptr = []
+        self._intercept_indptr = []
+        start = 0
+
+        # Save sizes and indices of coefficients for faster unpacking
+        for i in range(self.n_layers_ - 1):
+            n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1]
+
+            end = start + (n_fan_in * n_fan_out)
+            self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))
+            start = end
+
+        # Save sizes and indices of intercepts for faster unpacking
+        for i in range(self.n_layers_ - 1):
+            end = start + layer_units[i + 1]
+            self._intercept_indptr.append((start, end))
+            start = end
+
+        # Run LBFGS
+        packed_coef_inter = _pack(self.layers_coef_,
+                                  self.layers_intercept_)
+
+        if self.verbose is True or self.verbose >= 1:
+            iprint = 1
+        else:
+            iprint = -1
+
+        optimal_parameters, self.cost_, d = fmin_l_bfgs_b(
+            x0=packed_coef_inter,
+            func=self._cost_grad_lbfgs,
+            maxfun=self.max_iter,
+            iprint=iprint,
+            pgtol=self.tol,
+            args=(X, y, activations, deltas, coef_grads, intercept_grads))
+
+        self._unpack(optimal_parameters)
+
+    def _fit_sgd(self, X, y, activations, deltas, coef_grads, intercept_grads,
+                 layer_units, incremental):
+        prev_cost = np.inf
+        cost_increase_count = 0
+        n_samples = X.shape[0]
+        batch_size = np.clip(self.batch_size, 1, n_samples)
+        intercept_update_prev = [np.zeros_like(grads) for grads in intercept_grads]
+        coef_update_prev = [np.zeros_like(grads) for grads in coef_grads]
+
+        try:
             for i in range(self.max_iter):
                 for batch_slice in gen_batches(n_samples, batch_size):
                     activations[0] = X[batch_slice]
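
A note on the new `_fit_lbfgs` path above: `scipy.optimize.fmin_l_bfgs_b` optimizes a single flat parameter vector, which is why the method records `(start, end, shape)` index pointers before optimizing. The round trip can be sketched as follows; this is a minimal illustration with hypothetical helper names `pack`/`unpack` mirroring the PR's `_pack` and `_unpack`, not the patch's exact code:

    import numpy as np

    def pack(coefs, intercepts):
        # Flatten every weight matrix and bias vector into the single
        # 1-D array that fmin_l_bfgs_b expects as x0.
        return np.hstack([a.ravel() for a in coefs + intercepts])

    def unpack(flat, coef_indptr, intercept_indptr):
        # Restore per-layer arrays from the flat vector using the saved
        # (start, end, shape) pointers.
        coefs = [flat[s:e].reshape(shape) for s, e, shape in coef_indptr]
        intercepts = [flat[s:e] for s, e in intercept_indptr]
        return coefs, intercepts
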
@@ -386,10 +443,13 @@ def _fit(self, X, y, incremental=False):

                     # update weights
                     for i in range(self.n_layers_ - 1):
-                        self.layers_coef_[i] -= (self.learning_rate_ *
-                                                 coef_grads[i])
-                        self.layers_intercept_[i] -= (self.learning_rate_ *
-                                                      intercept_grads[i])
+                        coef_update_prev[i] = ((1 - self.momentum) * coef_grads[i]
+                                               + self.momentum * coef_update_prev[i])
+                        self.layers_coef_[i] -= self.learning_rate_ * coef_update_prev[i]
+
+                        intercept_update_prev[i] = ((1 - self.momentum) * intercept_grads[i]
+                                                    + self.momentum * intercept_update_prev[i])
+                        self.layers_intercept_[i] -= self.learning_rate_ * intercept_update_prev[i]

                 if self.learning_rate == 'invscaling':
                     self.learning_rate_ = self.learning_rate_init / \
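
The weight update above keeps an exponentially weighted average of past gradients, update = (1 - momentum) * grad + momentum * update_prev, rather than the classical accumulating-velocity form of momentum. A standalone sketch of both rules for comparison; this is illustrative NumPy code with made-up function names, not part of the patch:

    import numpy as np

    def averaged_momentum_step(w, grad, update_prev, lr, m):
        # The rule used in this patch: smooth the gradient first, then
        # take a plain gradient step along the smoothed direction.
        update = (1 - m) * grad + m * update_prev
        return w - lr * update, update

    def classical_momentum_step(w, grad, velocity, lr, m):
        # Classical "heavy ball" momentum for comparison: the velocity
        # itself accumulates scaled past steps.
        velocity = m * velocity - lr * grad
        return w + velocity, velocity
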
@@ -423,47 +483,8 @@ def _fit(self, X, y, incremental=False):
                 warnings.warn('SGD: Maximum iterations have reached and'
                               ' the optimization hasn\'t converged yet.'
                               % (), ConvergenceWarning)
-        # Run the LBFGS algorithm
-        elif self.algorithm == 'l-bfgs':
-            # Store meta information for the parameters
-            self._coef_indptr = []
-            self._intercept_indptr = []
-            start = 0
-
-            # Save sizes and indices of coefficients for faster unpacking
-            for i in range(self.n_layers_ - 1):
-                n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1]
-
-                end = start + (n_fan_in * n_fan_out)
-                self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))
-                start = end
-
-            # Save sizes and indices of intercepts for faster unpacking
-            for i in range(self.n_layers_ - 1):
-                end = start + layer_units[i + 1]
-                self._intercept_indptr.append((start, end))
-                start = end
-
-            # Run LBFGS
-            packed_coef_inter = _pack(self.layers_coef_,
-                                      self.layers_intercept_)
-
-            if self.verbose is True or self.verbose >= 1:
-                iprint = 1
-            else:
-                iprint = -1
-
-            optimal_parameters, self.cost_, d = fmin_l_bfgs_b(
-                x0=packed_coef_inter,
-                func=self._cost_grad_lbfgs,
-                maxfun=self.max_iter,
-                iprint=iprint,
-                pgtol=self.tol,
-                args=(X, y, activations, deltas, coef_grads, intercept_grads))
-
-            self._unpack(optimal_parameters)
-
-        return self
+        except KeyboardInterrupt:
+            pass

     def fit(self, X, y):
         """Fit the model to the data X and target y.
@@ -592,6 +613,8 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,
         each time step 't' using an inverse scaling exponent of 'power_t'.
         learning_rate_ = learning_rate_init / pow(t, power_t)

+        Only used when algorithm='sgd'.
+
     max_iter : int, optional, default 200
         Maximum number of iterations. The algorithm
         iterates until convergence (determined by 'tol') or
@@ -600,7 +623,7 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,
     random_state : int or RandomState, optional, default None
         State of or seed for random number generator.

-    shuffle : bool, optional, default False
+    shuffle : bool, optional, default True
         Whether to shuffle samples in each iteration before extracting
         minibatches.
@@ -611,12 +634,12 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,

     learning_rate_init : double, optional, default 0.5
         The initial learning rate used. It controls the step-size
-        in updating the weights.
+        in updating the weights. Only used when algorithm='sgd'.

     power_t : double, optional, default 0.5
         The exponent for inverse scaling learning rate.
         It is used in updating learning_rate_init when the learning_rate
-        is set to 'invscaling'.
+        is set to 'invscaling'. Only used when algorithm='sgd'.

     verbose : bool, optional, default False
         Whether to print progress messages to stdout.
@@ -626,6 +649,10 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,
         call to fit as initialization, otherwise, just erase the
         previous solution.

+    momentum : float, default 0
+        Momentum for gradient descent update. Should be between 0 and 1. Only
+        used when algorithm='sgd'.
+
     Attributes
     ----------
     `classes_` : array or list of array of shape (n_classes,)
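
For context, the new parameter would be exercised like this; a hedged sketch assuming this branch's import path, with momentum only taking effect when algorithm='sgd' (under 'l-bfgs' it is stored but ignored):

    from sklearn.neural_network import MultilayerPerceptronClassifier

    clf = MultilayerPerceptronClassifier(algorithm='sgd', momentum=0.9,
                                         learning_rate_init=0.1,
                                         max_iter=200, random_state=0)
    clf.fit([[0., 0.], [1., 1.]], [0, 1])
    print(clf.predict([[2., 2.]]))
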
@@ -686,8 +713,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                  algorithm='l-bfgs', alpha=0.00001,
                  batch_size=200, learning_rate="constant",
                  learning_rate_init=0.5, power_t=0.5, max_iter=200,
-                 shuffle=False, random_state=None, tol=1e-5,
-                 verbose=False, warm_start=False):
+                 shuffle=True, random_state=None, tol=1e-5,
+                 verbose=False, warm_start=False, momentum=0):

         sup = super(MultilayerPerceptronClassifier, self)
         sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
@@ -696,7 +723,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                      learning_rate_init=learning_rate_init, power_t=power_t,
                      max_iter=max_iter, loss='log_loss', shuffle=shuffle,
                      random_state=random_state, tol=tol,
-                     verbose=verbose, warm_start=warm_start)
+                     verbose=verbose, warm_start=warm_start, momentum=momentum)

         self.label_binarizer_ = LabelBinarizer()

@@ -859,6 +886,8 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):
         each time step 't' using an inverse scaling exponent of 'power_t'.
         learning_rate_ = learning_rate_init / pow(t, power_t)

+        Only used when algorithm='sgd'.
+
     max_iter : int, optional, default 200
         Maximum number of iterations. The algorithm
         iterates until convergence (determined by 'tol') or
@@ -867,9 +896,9 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):
     random_state : int or RandomState, optional, default None
         State of or seed for random number generator.

-    shuffle : bool, optional, default False
+    shuffle : bool, optional, default True
         Whether to shuffle samples in each iteration before extracting
-        minibatches.
+        minibatches. Only used when algorithm='sgd'.

     tol : float, optional, default 1e-5
         Tolerance for the optimization. When the loss at iteration i+1 differs
@@ -878,12 +907,12 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):

     learning_rate_init : double, optional, default 0.5
         The initial learning rate used. It controls the step-size
-        in updating the weights.
+        in updating the weights. Only used when algorithm='sgd'.

     power_t : double, optional, default 0.5
         The exponent for inverse scaling learning rate.
         It is used for updating learning_rate_ when it
-        is set to 'invscaling'.
+        is set to 'invscaling'. Only used when algorithm='sgd'.

     verbose : bool, optional, default False
         Whether to print progress messages to stdout.
@@ -893,6 +922,10 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):
         call to fit as initialization, otherwise, just erase the
         previous solution.

+    momentum : float, default 0
+        Momentum for gradient descent update. Should be between 0 and 1. Only
+        used when algorithm='sgd'.
+
     Attributes
     ----------
     `cost_` : float
@@ -947,9 +980,9 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                  algorithm='l-bfgs', alpha=0.00001,
                  batch_size=200, learning_rate="constant",
                  learning_rate_init=0.1,
-                 power_t=0.5, max_iter=100, shuffle=False,
+                 power_t=0.5, max_iter=100, shuffle=True,
                  random_state=None, tol=1e-5,
-                 verbose=False, warm_start=False):
+                 verbose=False, warm_start=False, momentum=0):

         sup = super(MultilayerPerceptronRegressor, self)
         sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
@@ -958,7 +991,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                      learning_rate_init=learning_rate_init, power_t=power_t,
                      max_iter=max_iter, loss='squared_loss', shuffle=shuffle,
                      random_state=random_state, tol=tol,
-                     verbose=verbose, warm_start=warm_start)
+                     verbose=verbose, warm_start=warm_start, momentum=momentum)

     def predict(self, X):
         """Predict using the multi-layer perceptron model.