diff --git a/examples/compose/plot_column_transformer.py b/examples/compose/plot_column_transformer.py
index 14d9af4d371d2..d5c5cc1759226 100644
--- a/examples/compose/plot_column_transformer.py
+++ b/examples/compose/plot_column_transformer.py
@@ -42,7 +42,7 @@
 from sklearn.svm import LinearSVC
 
 
-class TextStats(BaseEstimator, TransformerMixin):
+class TextStats(TransformerMixin, BaseEstimator):
     """Extract features from each document for DictVectorizer"""
 
     def fit(self, x, y=None):
@@ -54,7 +54,7 @@ def transform(self, posts):
                 for text in posts]
 
 
-class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
+class SubjectBodyExtractor(TransformerMixin, BaseEstimator):
     """Extract the subject & body from a usenet post in a single pass.
 
     Takes a sequence of strings and produces a dict of sequences.  Keys are
diff --git a/sklearn/base.py b/sklearn/base.py
index dfc334a0efdc3..4866fc2536438 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -129,17 +129,6 @@ def _pprint(params, offset=0, printer=repr):
     return lines
 
 
-def _update_if_consistent(dict1, dict2):
-    common_keys = set(dict1.keys()).intersection(dict2.keys())
-    for key in common_keys:
-        if dict1[key] != dict2[key]:
-            raise TypeError("Inconsistent values for tag {}: {} != {}".format(
-                key, dict1[key], dict2[key]
-            ))
-    dict1.update(dict2)
-    return dict1
-
-
 class BaseEstimator:
     """Base class for all estimators in scikit-learn
 
@@ -320,20 +309,19 @@ def __setstate__(self, state):
         except AttributeError:
             self.__dict__.update(state)
 
+    def _more_tags(self):
+        return _DEFAULT_TAGS
+
     def _get_tags(self):
         collected_tags = {}
-        for base_class in inspect.getmro(self.__class__):
-            if (hasattr(base_class, '_more_tags')
-                    and base_class != self.__class__):
+        for base_class in reversed(inspect.getmro(self.__class__)):
+            if hasattr(base_class, '_more_tags'):
+                # The hasattr check is needed because mixins might not define
+                # _more_tags. Classes that merely inherit it also pass, so the
+                # same method (e.g. BaseEstimator's) may be called repeatedly.
                 more_tags = base_class._more_tags(self)
-                collected_tags = _update_if_consistent(collected_tags,
-                                                       more_tags)
-        if hasattr(self, '_more_tags'):
-            more_tags = self._more_tags()
-            collected_tags = _update_if_consistent(collected_tags, more_tags)
-        tags = _DEFAULT_TAGS.copy()
-        tags.update(collected_tags)
-        return tags
+                collected_tags.update(more_tags)
+        return collected_tags
 
 
 class ClassifierMixin:
diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index b88a8b8eb37ef..d19a0d8ead5a5 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -465,7 +465,7 @@ def grad(AB):
     return AB_[0], AB_[1]
 
 
-class _SigmoidCalibration(BaseEstimator, RegressorMixin):
+class _SigmoidCalibration(RegressorMixin, BaseEstimator):
     """Sigmoid regression model.
 
     Attributes
diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py
index 89c6ce9fe8b34..4806afee90d1b 100644
--- a/sklearn/cluster/affinity_propagation_.py
+++ b/sklearn/cluster/affinity_propagation_.py
@@ -233,7 +233,7 @@ def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200,
 
 ###############################################################################
 
-class AffinityPropagation(BaseEstimator, ClusterMixin):
+class AffinityPropagation(ClusterMixin, BaseEstimator):
     """Perform Affinity Propagation Clustering of data.
 
     Read more in the :ref:`User Guide <affinity_propagation>`.
diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py
index 559bd515411f0..d841257f2a415 100644
--- a/sklearn/cluster/bicluster.py
+++ b/sklearn/cluster/bicluster.py
@@ -84,7 +84,7 @@ def _log_normalize(X):
     return L - row_avg - col_avg + avg
 
 
-class BaseSpectral(BaseEstimator, BiclusterMixin, metaclass=ABCMeta):
+class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for spectral biclustering."""
 
     @abstractmethod
diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py
index 11bb0f17a1dc6..edf5034fc08f8 100644
--- a/sklearn/cluster/birch.py
+++ b/sklearn/cluster/birch.py
@@ -319,7 +319,7 @@ def radius(self):
             self.sq_norm_)
 
 
-class Birch(BaseEstimator, TransformerMixin, ClusterMixin):
+class Birch(ClusterMixin, TransformerMixin, BaseEstimator):
     """Implements the Birch clustering algorithm.
 
     It is a memory-efficient, online-learning algorithm provided as an
diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py
index 9f4a55d3ad5b3..c123d22ff01f4 100644
--- a/sklearn/cluster/dbscan_.py
+++ b/sklearn/cluster/dbscan_.py
@@ -190,7 +190,7 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
     return np.where(core_samples)[0], labels
 
 
-class DBSCAN(BaseEstimator, ClusterMixin):
+class DBSCAN(ClusterMixin, BaseEstimator):
     """Perform DBSCAN clustering from vector array or distance matrix.
 
     DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py
index edf4dae76cd49..36ccf95253e96 100644
--- a/sklearn/cluster/hierarchical.py
+++ b/sklearn/cluster/hierarchical.py
@@ -652,7 +652,7 @@ def _hc_cut(n_clusters, children, n_leaves):
 
 ###############################################################################
 
-class AgglomerativeClustering(BaseEstimator, ClusterMixin):
+class AgglomerativeClustering(ClusterMixin, BaseEstimator):
     """
     Agglomerative Clustering
 
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index 65c5c26381d7d..a83df9c836b86 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -761,7 +761,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
     return centers
 
 
-class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
+class KMeans(TransformerMixin, ClusterMixin, BaseEstimator):
     """K-Means clustering
 
     Read more in the :ref:`User Guide <k_means>`.
diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py
index e588ccd6df1c8..6cccff6bddf18 100644
--- a/sklearn/cluster/mean_shift_.py
+++ b/sklearn/cluster/mean_shift_.py
@@ -293,7 +293,7 @@ def get_bin_seeds(X, bin_size, min_bin_freq=1):
     return bin_seeds
 
 
-class MeanShift(BaseEstimator, ClusterMixin):
+class MeanShift(ClusterMixin, BaseEstimator):
     """Mean shift clustering using a flat kernel.
 
     Mean shift clustering aims to discover "blobs" in a smooth density of
diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
index ecf5fa6a2bcc0..5d74658f3f524 100755
--- a/sklearn/cluster/optics_.py
+++ b/sklearn/cluster/optics_.py
@@ -21,7 +21,7 @@
 from ..metrics import pairwise_distances
 
 
-class OPTICS(BaseEstimator, ClusterMixin):
+class OPTICS(ClusterMixin, BaseEstimator):
     """Estimate clustering structure from vector array
 
     OPTICS (Ordering Points To Identify the Clustering Structure), closely
diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py
index 0398ec0df006f..d5016f3456735 100644
--- a/sklearn/cluster/spectral.py
+++ b/sklearn/cluster/spectral.py
@@ -272,7 +272,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None,
     return labels
 
 
-class SpectralClustering(BaseEstimator, ClusterMixin):
+class SpectralClustering(ClusterMixin, BaseEstimator):
     """Apply clustering to a projection of the normalized Laplacian.
 
     In practice Spectral Clustering is very useful when the structure of
diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py
index 152be6b549c75..1d88769f238aa 100644
--- a/sklearn/cluster/tests/test_bicluster.py
+++ b/sklearn/cluster/tests/test_bicluster.py
@@ -24,7 +24,7 @@
 from sklearn.datasets import make_biclusters, make_checkerboard
 
 
-class MockBiclustering(BaseEstimator, BiclusterMixin):
+class MockBiclustering(BiclusterMixin, BaseEstimator):
     # Mock object for testing get_submatrix.
     def __init__(self):
         pass
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index fb46d07d172f6..eafd901e2b5ca 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -33,7 +33,7 @@
                      "item instead of a scalar.")
 
 
-class ColumnTransformer(_BaseComposition, TransformerMixin):
+class ColumnTransformer(TransformerMixin, _BaseComposition):
     """Applies transformers to columns of an array or pandas DataFrame.
 
     This estimator allows different columns or column subsets of the input
diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
index 35b7ed6af962a..ce0c76d6486c5 100644
--- a/sklearn/compose/_target.py
+++ b/sklearn/compose/_target.py
@@ -14,7 +14,7 @@
 __all__ = ['TransformedTargetRegressor']
 
 
-class TransformedTargetRegressor(BaseEstimator, RegressorMixin):
+class TransformedTargetRegressor(RegressorMixin, BaseEstimator):
     """Meta-estimator to regress on a transformed target.
 
     Useful for applying a non-linear transformation in regression
diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py
index fcbf92e2a44ea..cab28f406c5f9 100644
--- a/sklearn/compose/tests/test_target.py
+++ b/sklearn/compose/tests/test_target.py
@@ -226,7 +226,7 @@ def func(y):
     assert_allclose(y_pred_1d_func, y_pred_2d_func)
 
 
-class DummyCheckerArrayTransformer(BaseEstimator, TransformerMixin):
+class DummyCheckerArrayTransformer(TransformerMixin, BaseEstimator):
 
     def fit(self, X, y=None):
         assert isinstance(X, np.ndarray)
@@ -268,7 +268,7 @@ def test_transform_target_regressor_ensure_y_array():
         tt.predict(X)
 
 
-class DummyTransformer(BaseEstimator, TransformerMixin):
+class DummyTransformer(TransformerMixin, BaseEstimator):
     """Dummy transformer which count how many time fit was called."""
     def __init__(self, fit_counter=0):
         self.fit_counter = fit_counter
diff --git a/sklearn/covariance/elliptic_envelope.py b/sklearn/covariance/elliptic_envelope.py
index aa5e01ffa14b0..5ee4cdeeef96d 100644
--- a/sklearn/covariance/elliptic_envelope.py
+++ b/sklearn/covariance/elliptic_envelope.py
@@ -9,7 +9,7 @@
 from ..base import OutlierMixin
 
 
-class EllipticEnvelope(MinCovDet, OutlierMixin):
+class EllipticEnvelope(OutlierMixin, MinCovDet):
     """An object for detecting outliers in a Gaussian distributed dataset.
 
     Read more in the :ref:`User Guide <outlier_detection>`.
diff --git a/sklearn/cross_decomposition/cca_.py b/sklearn/cross_decomposition/cca_.py
index abff4bbecc588..658ba1fa7e91f 100644
--- a/sklearn/cross_decomposition/cca_.py
+++ b/sklearn/cross_decomposition/cca_.py
@@ -4,7 +4,7 @@
 __all__ = ['CCA']
 
 
-class CCA(_PLS, _UnstableArchMixin):
+class CCA(_UnstableArchMixin, _PLS):
     """CCA Canonical Correlation Analysis.
 
     CCA inherits from PLS with mode="B" and deflation_mode="canonical".
diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py
index af45d4fa53a09..c1eb72df11607 100644
--- a/sklearn/cross_decomposition/pls_.py
+++ b/sklearn/cross_decomposition/pls_.py
@@ -121,7 +121,7 @@ def _center_scale_xy(X, Y, scale=True):
     return X, Y, x_mean, y_mean, x_std, y_std
 
 
-class _PLS(BaseEstimator, TransformerMixin, RegressorMixin, MultiOutputMixin,
+class _PLS(TransformerMixin, RegressorMixin, MultiOutputMixin, BaseEstimator,
            metaclass=ABCMeta):
     """Partial Least Squares (PLS)
 
@@ -750,7 +750,7 @@ def __init__(self, n_components=2, scale=True, algorithm="nipals",
             max_iter=max_iter, tol=tol, copy=copy)
 
 
-class PLSSVD(BaseEstimator, TransformerMixin):
+class PLSSVD(TransformerMixin, BaseEstimator):
     """Partial Least Square SVD
 
     Simply perform a svd on the crosscovariance matrix: X'Y
diff --git a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py
index 2f11d8bd847b8..e89a05051404b 100644
--- a/sklearn/decomposition/base.py
+++ b/sklearn/decomposition/base.py
@@ -17,7 +17,7 @@
 from abc import ABCMeta, abstractmethod
 
 
-class _BasePCA(BaseEstimator, TransformerMixin, metaclass=ABCMeta):
+class _BasePCA(TransformerMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for PCA methods.
 
     Warning: This class should not be used directly.
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index 1a5a42d526917..05f06edc05934 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -932,7 +932,7 @@ def transform(self, X):
         return code
 
 
-class SparseCoder(BaseEstimator, SparseCodingMixin):
+class SparseCoder(SparseCodingMixin, BaseEstimator):
     """Sparse coding
 
     Finds a sparse representation of data against a fixed, precomputed
@@ -1045,7 +1045,7 @@ def fit(self, X, y=None):
         return self
 
 
-class DictionaryLearning(BaseEstimator, SparseCodingMixin):
+class DictionaryLearning(SparseCodingMixin, BaseEstimator):
     """Dictionary learning
 
     Finds a dictionary (a set of atoms) that can best be used to represent data
@@ -1241,7 +1241,7 @@ def fit(self, X, y=None):
         return self
 
 
-class MiniBatchDictionaryLearning(BaseEstimator, SparseCodingMixin):
+class MiniBatchDictionaryLearning(SparseCodingMixin, BaseEstimator):
     """Mini-batch dictionary learning
 
     Finds a dictionary (a set of atoms) that can best be used to represent data
diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py
index ba624140ce1fc..4fa48d5d0d88f 100644
--- a/sklearn/decomposition/factor_analysis.py
+++ b/sklearn/decomposition/factor_analysis.py
@@ -32,7 +32,7 @@
 from ..exceptions import ConvergenceWarning
 
 
-class FactorAnalysis(BaseEstimator, TransformerMixin):
+class FactorAnalysis(TransformerMixin, BaseEstimator):
     """Factor Analysis (FA)
 
     A simple linear generative model with Gaussian latent variables.
diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py
index 3f6f1af632494..dffce0dc0d8bc 100644
--- a/sklearn/decomposition/fastica_.py
+++ b/sklearn/decomposition/fastica_.py
@@ -380,7 +380,7 @@ def g(x, fun_args):
                 return None, W, S
 
 
-class FastICA(BaseEstimator, TransformerMixin):
+class FastICA(TransformerMixin, BaseEstimator):
     """FastICA: a fast algorithm for Independent Component Analysis.
 
     Read more in the :ref:`User Guide <ICA>`.
diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py
index 59785fed3ac0e..1429106495a6e 100644
--- a/sklearn/decomposition/kernel_pca.py
+++ b/sklearn/decomposition/kernel_pca.py
@@ -16,7 +16,7 @@
 from ..metrics.pairwise import pairwise_kernels
 
 
-class KernelPCA(BaseEstimator, TransformerMixin):
+class KernelPCA(TransformerMixin, BaseEstimator):
     """Kernel Principal component analysis (KPCA)
 
     Non-linear dimensionality reduction through the use of kernels (see
diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py
index 0cc8713679136..f96f048992687 100644
--- a/sklearn/decomposition/nmf.py
+++ b/sklearn/decomposition/nmf.py
@@ -1068,7 +1068,7 @@ def non_negative_factorization(X, W=None, H=None, n_components=None,
     return W, H, n_iter
 
 
-class NMF(BaseEstimator, TransformerMixin):
+class NMF(TransformerMixin, BaseEstimator):
     r"""Non-Negative Matrix Factorization (NMF)
 
     Find two non-negative matrices (W, H) whose product approximates the non-
diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index 694893b6b2dc4..f6fd1beeee35c 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -132,7 +132,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior,
     return (doc_topic_distr, suff_stats)
 
 
-class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
+class LatentDirichletAllocation(TransformerMixin, BaseEstimator):
     """Latent Dirichlet Allocation with online variational Bayes algorithm
 
     .. versionadded:: 0.17
diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py
index 3ca14cb528bb8..50f869fa4b1e8 100644
--- a/sklearn/decomposition/sparse_pca.py
+++ b/sklearn/decomposition/sparse_pca.py
@@ -29,7 +29,7 @@ def _check_normalize_components(normalize_components, estimator_name):
             )
 
 
-class SparsePCA(BaseEstimator, TransformerMixin):
+class SparsePCA(TransformerMixin, BaseEstimator):
     """Sparse Principal Components Analysis (SparsePCA)
 
     Finds the set of sparse components that can optimally reconstruct
diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py
index ce79fba2fad1d..2440e90cb251e 100644
--- a/sklearn/decomposition/truncated_svd.py
+++ b/sklearn/decomposition/truncated_svd.py
@@ -18,7 +18,7 @@
 __all__ = ["TruncatedSVD"]
 
 
-class TruncatedSVD(BaseEstimator, TransformerMixin):
+class TruncatedSVD(TransformerMixin, BaseEstimator):
     """Dimensionality reduction using truncated SVD (aka LSA).
 
     This transformer performs linear dimensionality reduction by means of
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py
index efe39b8c3fb9a..7439033a8fe73 100644
--- a/sklearn/discriminant_analysis.py
+++ b/sklearn/discriminant_analysis.py
@@ -553,7 +553,7 @@ def predict_log_proba(self, X):
         return np.log(self.predict_proba(X))
 
 
-class QuadraticDiscriminantAnalysis(BaseEstimator, ClassifierMixin):
+class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator):
     """Quadratic Discriminant Analysis
 
     A classifier with a quadratic decision boundary, generated
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index 067a956f6435d..ab79321bd4fa3 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -19,7 +19,7 @@
 from .utils.multiclass import class_distribution
 
 
-class DummyClassifier(BaseEstimator, ClassifierMixin, MultiOutputMixin):
+class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
     """
     DummyClassifier is a classifier that makes predictions using simple rules.
 
@@ -353,7 +353,7 @@ def score(self, X, y, sample_weight=None):
         return super().score(X, y, sample_weight)
 
 
-class DummyRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
+class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     """
     DummyRegressor is a regressor that makes predictions using
     simple rules.
diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py
index a6c779ca0a97b..18cddca2d867f 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/binning.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py
@@ -83,7 +83,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state):
     return binning_thresholds
 
 
-class _BinMapper(BaseEstimator, TransformerMixin):
+class _BinMapper(TransformerMixin, BaseEstimator):
     """Transformer that maps a dataset into integer-valued bins.
 
     The bins are created in a feature-wise fashion, using quantiles so that
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index 70a507d09c1c6..04598cc5fa4e3 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -639,7 +639,7 @@ def n_iter_(self):
         return len(self._predictors)
 
 
-class HistGradientBoostingRegressor(BaseHistGradientBoosting, RegressorMixin):
+class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
     """Histogram-based Gradient Boosting Regression Tree.
 
     This estimator is much faster than
diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py
index 37dc5a97b4e67..cd6d18e743dbc 100644
--- a/sklearn/ensemble/bagging.py
+++ b/sklearn/ensemble/bagging.py
@@ -429,7 +429,7 @@ def estimators_samples_(self):
                 for _, sample_indices in self._get_estimators_indices()]
 
 
-class BaggingClassifier(BaseBagging, ClassifierMixin):
+class BaggingClassifier(ClassifierMixin, BaseBagging):
     """A Bagging classifier.
 
     A Bagging classifier is an ensemble meta-estimator that fits base
@@ -816,7 +816,7 @@ def decision_function(self, X):
         return decisions
 
 
-class BaggingRegressor(BaseBagging, RegressorMixin):
+class BaggingRegressor(RegressorMixin, BaseBagging):
     """A Bagging regressor.
 
     A Bagging regressor is an ensemble meta-estimator that fits base
diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py
index 379b376dd8de7..36c7b1067c381 100644
--- a/sklearn/ensemble/base.py
+++ b/sklearn/ensemble/base.py
@@ -58,7 +58,7 @@ def _set_random_states(estimator, random_state=None):
         estimator.set_params(**to_set)
 
 
-class BaseEnsemble(BaseEstimator, MetaEstimatorMixin, metaclass=ABCMeta):
+class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for all ensemble classes.
 
     Warning: This class should not be used directly. Use derived classes
diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index df24411c4a974..856379f4f012e 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -122,7 +122,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
     return tree
 
 
-class BaseForest(BaseEnsemble, MultiOutputMixin, metaclass=ABCMeta):
+class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta):
     """Base class for forests of trees.
 
     Warning: This class should not be used directly. Use derived classes
@@ -392,7 +392,7 @@ def _accumulate_prediction(predict, X, out, lock):
                 out[i] += prediction[i]
 
 
-class ForestClassifier(BaseForest, ClassifierMixin, metaclass=ABCMeta):
+class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta):
     """Base class for forest of trees-based classifiers.
 
     Warning: This class should not be used directly. Use derived classes
@@ -633,7 +633,7 @@ def predict_log_proba(self, X):
             return proba
 
 
-class ForestRegressor(BaseForest, RegressorMixin, metaclass=ABCMeta):
+class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta):
     """Base class for forest of trees-based regressors.
 
     Warning: This class should not be used directly. Use derived classes
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index ec5f9a111ccf1..a74cb3aa05e36 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -1810,7 +1810,7 @@ def apply(self, X):
         return leaves
 
 
-class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
+class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
     """Gradient Boosting for classification.
 
     GB builds an additive model in a
@@ -2286,7 +2286,7 @@ def staged_predict_proba(self, X):
                                  self.loss)
 
 
-class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin):
+class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
     """Gradient Boosting for regression.
 
     GB builds an additive model in a forward stage-wise fashion;
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py
index 4cdeb9673ccdb..8b07eedabe19a 100644
--- a/sklearn/ensemble/iforest.py
+++ b/sklearn/ensemble/iforest.py
@@ -24,7 +24,7 @@
 __all__ = ["IsolationForest"]
 
 
-class IsolationForest(BaseBagging, OutlierMixin):
+class IsolationForest(OutlierMixin, BaseBagging):
     """Isolation Forest Algorithm
 
     Return the anomaly score of each sample using the IsolationForest algorithm
diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py
index 3ef1e8c09f27f..a02efe4d925d8 100644
--- a/sklearn/ensemble/tests/test_voting.py
+++ b/sklearn/ensemble/tests/test_voting.py
@@ -333,7 +333,7 @@ def test_sample_weight():
 
     # check that _parallel_fit_estimator will raise the right error
     # it should raise the original error if this is not linked to sample_weight
-    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
+    class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
         def fit(self, X, y, sample_weight):
             raise TypeError('Error unrelated to sample_weight.')
     clf = ClassifierErrorFit()
@@ -343,7 +343,7 @@ def fit(self, X, y, sample_weight):
 
 def test_sample_weight_kwargs():
     """Check that VotingClassifier passes sample_weight as kwargs"""
-    class MockClassifier(BaseEstimator, ClassifierMixin):
+    class MockClassifier(ClassifierMixin, BaseEstimator):
         """Mock Classifier to check that sample_weight is received as kwargs"""
         def fit(self, X, y, *args, **sample_weight):
             assert 'sample_weight' in sample_weight
diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py
index 69381a39d9ce3..42487a2426a30 100644
--- a/sklearn/ensemble/voting.py
+++ b/sklearn/ensemble/voting.py
@@ -48,7 +48,7 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
     return estimator
 
 
-class _BaseVoting(_BaseComposition, TransformerMixin):
+class _BaseVoting(TransformerMixin, _BaseComposition):
     """Base class for voting.
 
     Warning: This class should not be used directly. Use derived classes
@@ -145,7 +145,7 @@ def get_params(self, deep=True):
         return self._get_params('estimators', deep=deep)
 
 
-class VotingClassifier(_BaseVoting, ClassifierMixin):
+class VotingClassifier(ClassifierMixin, _BaseVoting):
     """Soft Voting/Majority Rule classifier for unfitted estimators.
 
     .. versionadded:: 0.17
@@ -375,7 +375,7 @@ class labels predicted by each classifier.
             return self._predict(X)
 
 
-class VotingRegressor(_BaseVoting, RegressorMixin):
+class VotingRegressor(RegressorMixin, _BaseVoting):
     """Prediction voting regressor for unfitted estimators.
 
     .. versionadded:: 0.21
diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py
index b0a634ce1be6f..84dfecb7cf7e6 100644
--- a/sklearn/ensemble/weight_boosting.py
+++ b/sklearn/ensemble/weight_boosting.py
@@ -290,7 +290,7 @@ def _samme_proba(estimator, n_classes, X):
                               * log_proba.sum(axis=1)[:, np.newaxis])
 
 
-class AdaBoostClassifier(BaseWeightBoosting, ClassifierMixin):
+class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting):
     """An AdaBoost classifier.
 
     An AdaBoost [1] classifier is a meta-estimator that begins by fitting a
@@ -854,7 +854,7 @@ def predict_log_proba(self, X):
         return np.log(self.predict_proba(X))
 
 
-class AdaBoostRegressor(BaseWeightBoosting, RegressorMixin):
+class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting):
     """An AdaBoost regressor.
 
     An AdaBoost [1] regressor is a meta-estimator that begins by fitting a
diff --git a/sklearn/feature_extraction/dict_vectorizer.py b/sklearn/feature_extraction/dict_vectorizer.py
index 4a2aa58189c93..857806c892806 100644
--- a/sklearn/feature_extraction/dict_vectorizer.py
+++ b/sklearn/feature_extraction/dict_vectorizer.py
@@ -21,7 +21,7 @@ def _tosequence(X):
         return tosequence(X)
 
 
-class DictVectorizer(BaseEstimator, TransformerMixin):
+class DictVectorizer(TransformerMixin, BaseEstimator):
     """Transforms lists of feature-value mappings to vectors.
 
     This transformer turns lists of mappings (dict-like objects) of feature
diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index b141d114f9fd8..703bb2deeb9d5 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -24,7 +24,7 @@ def _iteritems(d):
     return d.iteritems() if hasattr(d, "iteritems") else d.items()
 
 
-class FeatureHasher(BaseEstimator, TransformerMixin):
+class FeatureHasher(TransformerMixin, BaseEstimator):
     """Implements feature hashing, aka the hashing trick.
 
     This class turns sequences of symbolic feature names (strings) into
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 0c794d8a3776e..21851c12e48b1 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -469,7 +469,7 @@ def _validate_params(self):
                 % str(self.ngram_range))
 
 
-class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin):
+class HashingVectorizer(TransformerMixin, VectorizerMixin, BaseEstimator):
     """Convert a collection of text documents to a matrix of token occurrences
 
     It turns a collection of text documents into a scipy.sparse matrix holding
@@ -753,7 +753,7 @@ def _document_frequency(X):
         return np.diff(X.indptr)
 
 
-class CountVectorizer(BaseEstimator, VectorizerMixin):
+class CountVectorizer(VectorizerMixin, BaseEstimator):
     """Convert a collection of text documents to a matrix of token counts
 
     This implementation produces a sparse representation of the counts using
@@ -1229,7 +1229,7 @@ def _make_int_array():
     return array.array(str("i"))
 
 
-class TfidfTransformer(BaseEstimator, TransformerMixin):
+class TfidfTransformer(TransformerMixin, BaseEstimator):
     """Transform a count matrix to a normalized tf or tf-idf representation
 
     Tf means term-frequency while tf-idf means term-frequency times inverse
diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py
index fb26f9d685688..6d732d0e43dfd 100644
--- a/sklearn/feature_selection/from_model.py
+++ b/sklearn/feature_selection/from_model.py
@@ -78,7 +78,7 @@ def _calculate_threshold(estimator, importances, threshold):
     return threshold
 
 
-class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin):
+class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator):
     """Meta-transformer for selecting features based on importance weights.
 
     .. versionadded:: 0.17
diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py
index 4e957e8463a7c..1c63ae86b6196 100644
--- a/sklearn/feature_selection/rfe.py
+++ b/sklearn/feature_selection/rfe.py
@@ -34,7 +34,7 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
         _score(estimator, X_test[:, features], y_test, scorer)).scores_
 
 
-class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin):
+class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
     """Feature ranking with recursive feature elimination.
 
     Given an external estimator that assigns weights to features (e.g., the
diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py
index 5b1cae1823e9c..5921e3494469b 100644
--- a/sklearn/feature_selection/univariate_selection.py
+++ b/sklearn/feature_selection/univariate_selection.py
@@ -309,7 +309,7 @@ def f_regression(X, y, center=True):
 ######################################################################
 # Base classes
 
-class _BaseFilter(BaseEstimator, SelectorMixin):
+class _BaseFilter(SelectorMixin, BaseEstimator):
     """Initialize the univariate feature selection.
 
     Parameters
diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py
index c9eb973dc86c3..62323f1ff2ec8 100644
--- a/sklearn/feature_selection/variance_threshold.py
+++ b/sklearn/feature_selection/variance_threshold.py
@@ -9,7 +9,7 @@
 from ..utils.validation import check_is_fitted
 
 
-class VarianceThreshold(BaseEstimator, SelectorMixin):
+class VarianceThreshold(SelectorMixin, BaseEstimator):
     """Feature selector that removes all low-variance features.
 
     This feature selection algorithm looks only at the features (X), not the
diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py
index 5421f7e408472..2a3ce2dfa3170 100644
--- a/sklearn/gaussian_process/gpc.py
+++ b/sklearn/gaussian_process/gpc.py
@@ -449,7 +449,7 @@ def _constrained_optimization(self, obj_func, initial_theta, bounds):
         return theta_opt, func_min
 
 
-class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
+class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
     """Gaussian process classification (GPC) based on Laplace approximation.
 
     The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index cc9806cd1c41e..7d131c757bc78 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -19,8 +19,8 @@
 from ..utils.optimize import _check_optimize_result
 
 
-class GaussianProcessRegressor(BaseEstimator, RegressorMixin,
-                               MultiOutputMixin):
+class GaussianProcessRegressor(MultiOutputMixin,
+                               RegressorMixin, BaseEstimator):
     """Gaussian process regression (GPR).
 
     The implementation is based on Algorithm 2.1 of Gaussian Processes
diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py
index 39dbe2bb9a7e8..73be0b5485590 100644
--- a/sklearn/impute/_base.py
+++ b/sklearn/impute/_base.py
@@ -64,7 +64,7 @@ def _most_frequent(array, extra_value, n_repeat):
             return extra_value
 
 
-class SimpleImputer(BaseEstimator, TransformerMixin):
+class SimpleImputer(TransformerMixin, BaseEstimator):
     """Imputation transformer for completing missing values.
 
     Read more in the :ref:`User Guide <impute>`.
@@ -416,7 +416,7 @@ def _more_tags(self):
         return {'allow_nan': True}
 
 
-class MissingIndicator(BaseEstimator, TransformerMixin):
+class MissingIndicator(TransformerMixin, BaseEstimator):
     """Binary indicators for missing values.
 
     Note that this component typically should not be used in a vanilla
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
index 25722c9bbf304..9a3ea79762ec9 100644
--- a/sklearn/impute/_iterative.py
+++ b/sklearn/impute/_iterative.py
@@ -25,7 +25,7 @@
                                                  'estimator'])
 
 
-class IterativeImputer(BaseEstimator, TransformerMixin):
+class IterativeImputer(TransformerMixin, BaseEstimator):
     """Multivariate imputer that estimates each feature from all the others.
 
     A strategy for imputing missing values by modeling each feature with
diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py
index 19399224e07ba..2543b4cc39b0b 100644
--- a/sklearn/inspection/tests/test_partial_dependence.py
+++ b/sklearn/inspection/tests/test_partial_dependence.py
@@ -287,7 +287,7 @@ def test_multiclass_multioutput(Estimator):
         partial_dependence(est, X, [0])
 
 
-class NoPredictProbaNoDecisionFunction(BaseEstimator, ClassifierMixin):
+class NoPredictProbaNoDecisionFunction(ClassifierMixin, BaseEstimator):
     def fit(self, X, y):
         # simulate that we have some classes
         self.classes_ = [0, 1]
diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
index 40beb3abcab73..99f8d1f7aa9bb 100644
--- a/sklearn/isotonic.py
+++ b/sklearn/isotonic.py
@@ -137,7 +137,7 @@ def isotonic_regression(y, sample_weight=None, y_min=None, y_max=None,
     return y[order]
 
 
-class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin):
+class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator):
     """Isotonic regression model.
 
     The isotonic regression optimization problem is defined by::
diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py
index 7a2b404304daf..248f9595c5b95 100644
--- a/sklearn/kernel_approximation.py
+++ b/sklearn/kernel_approximation.py
@@ -21,7 +21,7 @@
 from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS
 
 
-class RBFSampler(BaseEstimator, TransformerMixin):
+class RBFSampler(TransformerMixin, BaseEstimator):
     """Approximates feature map of an RBF kernel by Monte Carlo approximation
     of its Fourier transform.
 
@@ -125,7 +125,7 @@ def transform(self, X):
         return projection
 
 
-class SkewedChi2Sampler(BaseEstimator, TransformerMixin):
+class SkewedChi2Sampler(TransformerMixin, BaseEstimator):
     """Approximates feature map of the "skewed chi-squared" kernel by Monte
     Carlo approximation of its Fourier transform.
 
@@ -239,7 +239,7 @@ def transform(self, X):
         return projection
 
 
-class AdditiveChi2Sampler(BaseEstimator, TransformerMixin):
+class AdditiveChi2Sampler(TransformerMixin, BaseEstimator):
     """Approximate feature map for additive chi2 kernel.
 
     Uses sampling the fourier transform of the kernel characteristic
@@ -429,7 +429,7 @@ def _more_tags(self):
         return {'stateless': True}
 
 
-class Nystroem(BaseEstimator, TransformerMixin):
+class Nystroem(TransformerMixin, BaseEstimator):
     """Approximate a kernel map using a subset of the training data.
 
     Constructs an approximate feature map for an arbitrary kernel
diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py
index 3d69066e342d6..fef571056c945 100644
--- a/sklearn/kernel_ridge.py
+++ b/sklearn/kernel_ridge.py
@@ -13,7 +13,7 @@
 from .utils.validation import check_is_fitted
 
 
-class KernelRidge(BaseEstimator, RegressorMixin, MultiOutputMixin):
+class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator):
     """Kernel ridge regression.
 
     Kernel ridge regression (KRR) combines ridge regression (linear least
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index d2af98d07ac09..c554c8a921d9e 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -362,7 +362,7 @@ def sparsify(self):
         return self
 
 
-class LinearRegression(LinearModel, RegressorMixin, MultiOutputMixin):
+class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
     """
     Ordinary least squares Linear Regression.
 
diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py
index 7b19ed3ce607f..a2c1be45c50bd 100644
--- a/sklearn/linear_model/bayes.py
+++ b/sklearn/linear_model/bayes.py
@@ -19,7 +19,7 @@
 ###############################################################################
 # BayesianRidge regression
 
-class BayesianRidge(LinearModel, RegressorMixin):
+class BayesianRidge(RegressorMixin, LinearModel):
     """Bayesian ridge regression.
 
     Fit a Bayesian ridge model. See the Notes section for details on this
@@ -375,7 +375,7 @@ def _log_marginal_likelihood(self, n_samples, n_features, eigen_vals,
 # ARD (Automatic Relevance Determination) regression
 
 
-class ARDRegression(LinearModel, RegressorMixin):
+class ARDRegression(RegressorMixin, LinearModel):
     """Bayesian ARD regression.
 
     Fit the weights of a regression model, using an ARD prior. The weights of
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 35bfcb692ca2f..030c8fe8d4bdf 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -498,7 +498,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
 # ElasticNet model
 
 
-class ElasticNet(LinearModel, RegressorMixin, MultiOutputMixin):
+class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel):
     """Linear regression with combined L1 and L2 priors as regularizer.
 
     Minimizes the objective function::
@@ -1035,7 +1035,7 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None,
     return this_mses
 
 
-class LinearModelCV(LinearModel, MultiOutputMixin, metaclass=ABCMeta):
+class LinearModelCV(MultiOutputMixin, LinearModel, metaclass=ABCMeta):
     """Base class for iterative model fitting along a regularization path"""
 
     @abstractmethod
@@ -1231,7 +1231,7 @@ def fit(self, X, y):
         return self
 
 
-class LassoCV(LinearModelCV, RegressorMixin):
+class LassoCV(RegressorMixin, LinearModelCV):
     """Lasso linear model with iterative fitting along a regularization path.
 
     See glossary entry for :term:`cross-validation estimator`.
@@ -1397,7 +1397,7 @@ def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True,
             random_state=random_state, selection=selection)
 
 
-class ElasticNetCV(LinearModelCV, RegressorMixin):
+class ElasticNetCV(RegressorMixin, LinearModelCV):
     """Elastic Net model with iterative fitting along a regularization path.
 
     See glossary entry for :term:`cross-validation estimator`.
@@ -1917,7 +1917,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
         self.selection = selection
 
 
-class MultiTaskElasticNetCV(LinearModelCV, RegressorMixin):
+class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV):
     """Multi-task L1/L2 ElasticNet with built-in cross-validation.
 
     See glossary entry for :term:`cross-validation estimator`.
@@ -2105,7 +2105,7 @@ def _more_tags(self):
         return {'multioutput_only': True}
 
 
-class MultiTaskLassoCV(LinearModelCV, RegressorMixin):
+class MultiTaskLassoCV(RegressorMixin, LinearModelCV):
     """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.
 
     See glossary entry for :term:`cross-validation estimator`.
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py
index 2df43cca9365f..6fa3ae3008a35 100644
--- a/sklearn/linear_model/least_angle.py
+++ b/sklearn/linear_model/least_angle.py
@@ -761,7 +761,7 @@ def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500,
 ###############################################################################
 # Estimator classes
 
-class Lars(LinearModel, RegressorMixin, MultiOutputMixin):
+class Lars(MultiOutputMixin, RegressorMixin, LinearModel):
     """Least Angle Regression model a.k.a. LAR
 
     Read more in the :ref:`User Guide <least_angle_regression>`.
diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py
index df6e44f5708e0..ff8dcac367414 100644
--- a/sklearn/linear_model/omp.py
+++ b/sklearn/linear_model/omp.py
@@ -539,7 +539,7 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None,
         return np.squeeze(coef)
 
 
-class OrthogonalMatchingPursuit(LinearModel, RegressorMixin, MultiOutputMixin):
+class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel):
     """Orthogonal Matching Pursuit model (OMP)
 
     Read more in the :ref:`User Guide <omp>`.
@@ -753,7 +753,7 @@ def _omp_path_residues(X_train, y_train, X_test, y_test, copy=True,
     return np.dot(coefs.T, X_test.T) - y_test
 
 
-class OrthogonalMatchingPursuitCV(LinearModel, RegressorMixin):
+class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel):
     """Cross-validated Orthogonal Matching Pursuit model (OMP).
 
     See glossary entry for :term:`cross-validation estimator`.
diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py
index e868a31d17c8d..3d390c5c67e61 100644
--- a/sklearn/linear_model/ransac.py
+++ b/sklearn/linear_model/ransac.py
@@ -53,8 +53,8 @@ def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability):
     return abs(float(np.ceil(np.log(nom) / np.log(denom))))
 
 
-class RANSACRegressor(BaseEstimator, MetaEstimatorMixin, RegressorMixin,
-                      MultiOutputMixin):
+class RANSACRegressor(MetaEstimatorMixin, RegressorMixin,
+                      MultiOutputMixin, BaseEstimator):
     """RANSAC (RANdom SAmple Consensus) algorithm.
 
     RANSAC is an iterative algorithm for the robust estimation of parameters
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
index b1c24a5860227..5e9a58e90b4b2 100644
--- a/sklearn/linear_model/ridge.py
+++ b/sklearn/linear_model/ridge.py
@@ -521,7 +521,7 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
         return coef
 
 
-class _BaseRidge(LinearModel, MultiOutputMixin, metaclass=ABCMeta):
+class _BaseRidge(MultiOutputMixin, LinearModel, metaclass=ABCMeta):
     @abstractmethod
     def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
                  copy_X=True, max_iter=None, tol=1e-3, solver="auto",
@@ -602,7 +602,7 @@ def fit(self, X, y, sample_weight=None):
         return self
 
 
-class Ridge(_BaseRidge, RegressorMixin):
+class Ridge(RegressorMixin, _BaseRidge):
     """Linear least squares with l2 regularization.
 
     Minimizes the objective function::
@@ -1505,7 +1505,7 @@ def identity_estimator():
         return self
 
 
-class _BaseRidgeCV(LinearModel, MultiOutputMixin):
+class _BaseRidgeCV(MultiOutputMixin, LinearModel):
     def __init__(self, alphas=(0.1, 1.0, 10.0),
                  fit_intercept=True, normalize=False, scoring=None,
                  cv=None, gcv_mode=None,
@@ -1577,7 +1577,7 @@ def fit(self, X, y, sample_weight=None):
         return self
 
 
-class RidgeCV(_BaseRidgeCV, RegressorMixin):
+class RidgeCV(RegressorMixin, _BaseRidgeCV):
     """Ridge regression with built-in cross-validation.
 
     See glossary entry for :term:`cross-validation estimator`.
diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py
index c56792de96172..9314013dce0a6 100644
--- a/sklearn/linear_model/stochastic_gradient.py
+++ b/sklearn/linear_model/stochastic_gradient.py
@@ -65,7 +65,7 @@ def __call__(self, coef, intercept):
         return est.score(self.X_val, self.y_val, self.sample_weight_val)
 
 
-class BaseSGD(BaseEstimator, SparseCoefMixin, metaclass=ABCMeta):
+class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for SGD classification and regression."""
 
     def __init__(self, loss, penalty='l2', alpha=0.0001, C=1.0,
@@ -420,7 +420,7 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
     return result
 
 
-class BaseSGDClassifier(BaseSGD, LinearClassifierMixin, metaclass=ABCMeta):
+class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta):
 
     loss_functions = {
         "hinge": (Hinge, 1.0),
@@ -1050,7 +1050,7 @@ def _predict_log_proba(self, X):
         return np.log(self.predict_proba(X))
 
 
-class BaseSGDRegressor(BaseSGD, RegressorMixin):
+class BaseSGDRegressor(RegressorMixin, BaseSGD):
 
     loss_functions = {
         "squared_loss": (SquaredLoss, ),
diff --git a/sklearn/linear_model/theil_sen.py b/sklearn/linear_model/theil_sen.py
index 941c51196cc4a..3468e904c3538 100644
--- a/sklearn/linear_model/theil_sen.py
+++ b/sklearn/linear_model/theil_sen.py
@@ -193,7 +193,7 @@ def _lstsq(X, y, indices, fit_intercept):
     return weights
 
 
-class TheilSenRegressor(LinearModel, RegressorMixin):
+class TheilSenRegressor(RegressorMixin, LinearModel):
     """Theil-Sen Estimator: robust multivariate regression model.
 
     The algorithm calculates least square solutions on subsets with size
diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py
index 88c979c0e1fdb..9243f69a4a127 100644
--- a/sklearn/manifold/isomap.py
+++ b/sklearn/manifold/isomap.py
@@ -12,7 +12,7 @@
 from ..preprocessing import KernelCenterer
 
 
-class Isomap(BaseEstimator, TransformerMixin):
+class Isomap(TransformerMixin, BaseEstimator):
     """Isomap Embedding
 
     Non-linear dimensionality reduction through Isometric Mapping
diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py
index 4e90d4876f4df..d758d42e6b62c 100644
--- a/sklearn/manifold/locally_linear.py
+++ b/sklearn/manifold/locally_linear.py
@@ -518,8 +518,8 @@ def locally_linear_embedding(
                       tol=tol, max_iter=max_iter, random_state=random_state)
 
 
-class LocallyLinearEmbedding(BaseEstimator, TransformerMixin,
-                             _UnstableArchMixin):
+class LocallyLinearEmbedding(TransformerMixin,
+                             _UnstableArchMixin, BaseEstimator):
     """Locally Linear Embedding
 
     Read more in the :ref:`User Guide <locally_linear_embedding>`.
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 02380722d0d95..b61b3f9467d50 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -397,7 +397,7 @@ def _check_param_grid(param_grid):
                                  "to be a non-empty sequence.".format(name))
 
 
-class BaseSearchCV(BaseEstimator, MetaEstimatorMixin, metaclass=ABCMeta):
+class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
     """Abstract base class for hyper parameter search with cross-validation.
     """
 
diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py
index 217f6ce87cba6..bced2fad2c36d 100644
--- a/sklearn/multiclass.py
+++ b/sklearn/multiclass.py
@@ -129,8 +129,8 @@ def predict_proba(self, X):
                          X.shape[0], axis=0)
 
 
-class OneVsRestClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin,
-                          MultiOutputMixin):
+class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin,
+                          MetaEstimatorMixin, BaseEstimator):
     """One-vs-the-rest (OvR) multiclass/multilabel strategy
 
     Also known as one-vs-all, this strategy consists in fitting one classifier
@@ -435,7 +435,7 @@ def _partial_fit_ovo_binary(estimator, X, y, i, j):
     return estimator
 
 
-class OneVsOneClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
+class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     """One-vs-one multiclass strategy
 
     This strategy consists in fitting one classifier per class pair.
@@ -562,7 +562,7 @@ def partial_fit(self, X, y, classes=None):
                 delayed(_partial_fit_ovo_binary)(
                     estimator, X, y, self.classes_[i], self.classes_[j])
                 for estimator, (i, j) in zip(self.estimators_,
-                                              (combinations)))
+                                             (combinations)))
 
         self.pairwise_indices_ = None
 
@@ -634,7 +634,7 @@ def _pairwise(self):
         return getattr(self.estimator, "_pairwise", False)
 
 
-class OutputCodeClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
+class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
     """(Error-Correcting) Output-Code multiclass strategy
 
     Output-code based strategies consist in representing each class with a
diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py
index 3b5a95349868e..e2cafe02efaa8 100644
--- a/sklearn/multioutput.py
+++ b/sklearn/multioutput.py
@@ -202,7 +202,7 @@ def _more_tags(self):
         return {'multioutput_only': True}
 
 
-class MultiOutputRegressor(MultiOutputEstimator, RegressorMixin):
+class MultiOutputRegressor(RegressorMixin, MultiOutputEstimator):
     """Multi target regression
 
     This strategy consists of fitting one regressor per target. This is a
@@ -297,7 +297,7 @@ def score(self, X, y, sample_weight=None):
                         multioutput='uniform_average')
 
 
-class MultiOutputClassifier(MultiOutputEstimator, ClassifierMixin):
+class MultiOutputClassifier(ClassifierMixin, MultiOutputEstimator):
     """Multi target classification
 
     This strategy consists of fitting one classifier per target. This is a
@@ -515,7 +515,7 @@ def predict(self, X):
         return Y_pred
 
 
-class ClassifierChain(_BaseChain, ClassifierMixin, MetaEstimatorMixin):
+class ClassifierChain(MetaEstimatorMixin, ClassifierMixin, _BaseChain):
     """A multi-label model that arranges binary classifiers into a chain.
 
     Each model makes a prediction in the order specified by the chain using
@@ -675,7 +675,7 @@ def _more_tags(self):
                 'multioutput_only': True}
 
 
-class RegressorChain(_BaseChain, RegressorMixin, MetaEstimatorMixin):
+class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain):
     """A multi-label model that arranges regressions into a chain.
 
     Each model makes a prediction in the order specified by the chain using
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index 904a5afecc67e..b3007e026e3db 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -35,7 +35,7 @@
 __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB']
 
 
-class BaseNB(BaseEstimator, ClassifierMixin, metaclass=ABCMeta):
+class BaseNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
     """Abstract base class for naive Bayes estimators"""
 
     @abstractmethod
diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py
index bbd2d74c4a716..9548a619b0b14 100644
--- a/sklearn/neighbors/base.py
+++ b/sklearn/neighbors/base.py
@@ -103,7 +103,7 @@ def _get_weights(dist, weights):
                          "'distance', or a callable function")
 
 
-class NeighborsBase(BaseEstimator, MultiOutputMixin, metaclass=ABCMeta):
+class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for nearest neighbors estimators."""
 
     @abstractmethod
diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py
index 68a72c92da865..aae5c4d6c8267 100644
--- a/sklearn/neighbors/nca.py
+++ b/sklearn/neighbors/nca.py
@@ -27,7 +27,7 @@
 from ..exceptions import ConvergenceWarning
 
 
-class NeighborhoodComponentsAnalysis(BaseEstimator, TransformerMixin):
+class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator):
     """Neighborhood Components Analysis
 
     Neighborhood Component Analysis (NCA) is a machine learning algorithm for
diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py
index 3e1577469c920..3967e772bf1bb 100644
--- a/sklearn/neighbors/nearest_centroid.py
+++ b/sklearn/neighbors/nearest_centroid.py
@@ -20,7 +20,7 @@
 from ..utils.multiclass import check_classification_targets
 
 
-class NearestCentroid(BaseEstimator, ClassifierMixin):
+class NearestCentroid(ClassifierMixin, BaseEstimator):
     """Nearest centroid classifier.
 
     Each class is represented by its centroid, with test samples classified to
diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py
index 11e682a448240..b6367d32e57a9 100644
--- a/sklearn/neural_network/multilayer_perceptron.py
+++ b/sklearn/neural_network/multilayer_perceptron.py
@@ -688,7 +688,7 @@ def _predict(self, X):
         return y_pred
 
 
-class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
+class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron):
     """Multi-layer Perceptron classifier.
 
     This model optimizes the log-loss function using LBFGS or stochastic
@@ -1080,7 +1080,7 @@ def predict_proba(self, X):
             return y_pred
 
 
-class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin):
+class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron):
     """Multi-layer Perceptron regressor.
 
     This model optimizes the squared-loss using LBFGS or stochastic gradient
diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py
index 3018e31f7d04d..efe3aeda951af 100644
--- a/sklearn/neural_network/rbm.py
+++ b/sklearn/neural_network/rbm.py
@@ -23,7 +23,7 @@
 from ..utils.validation import check_is_fitted
 
 
-class BernoulliRBM(BaseEstimator, TransformerMixin):
+class BernoulliRBM(TransformerMixin, BaseEstimator):
     """Bernoulli Restricted Boltzmann Machine (RBM).
 
     A Restricted Boltzmann Machine with binary visible units and
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 3d0207a4c16fc..a58979142ae7c 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -744,7 +744,7 @@ def _fit_one(transformer,
         return transformer.fit(X, y, **fit_params)
 
 
-class FeatureUnion(_BaseComposition, TransformerMixin):
+class FeatureUnion(TransformerMixin, _BaseComposition):
     """Concatenates results of multiple transformer objects.
 
     This estimator applies a list of transformer objects in parallel to the
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
index 1be7499f783ec..94fcd50f0270b 100644
--- a/sklearn/preprocessing/_discretization.py
+++ b/sklearn/preprocessing/_discretization.py
@@ -18,7 +18,7 @@
 from ..utils.validation import FLOAT_DTYPES
 
 
-class KBinsDiscretizer(BaseEstimator, TransformerMixin):
+class KBinsDiscretizer(TransformerMixin, BaseEstimator):
     """Bin continuous data into intervals.
 
     Read more in the :ref:`User Guide <preprocessing_discretization>`.
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index c33744204fc36..0ee5d32720e63 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -19,7 +19,7 @@
 ]
 
 
-class _BaseEncoder(BaseEstimator, TransformerMixin):
+class _BaseEncoder(TransformerMixin, BaseEstimator):
     """
     Base class for encoders that includes the code to categorize and
     transform the input features.
diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py
index 589a45a1e63d1..832857c0ad5dc 100644
--- a/sklearn/preprocessing/_function_transformer.py
+++ b/sklearn/preprocessing/_function_transformer.py
@@ -11,7 +11,7 @@ def _identity(X):
     return X
 
 
-class FunctionTransformer(BaseEstimator, TransformerMixin):
+class FunctionTransformer(TransformerMixin, BaseEstimator):
     """Constructs a transformer from an arbitrary callable.
 
     A FunctionTransformer forwards its X (and optionally y) arguments to a
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index b3f09664f025d..379fe41e8a541 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -196,7 +196,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     return X
 
 
-class MinMaxScaler(BaseEstimator, TransformerMixin):
+class MinMaxScaler(TransformerMixin, BaseEstimator):
     """Transforms features by scaling each feature to a given range.
 
     This estimator scales and translates each feature individually such
@@ -493,7 +493,7 @@ def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True):
     return X
 
 
-class StandardScaler(BaseEstimator, TransformerMixin):
+class StandardScaler(TransformerMixin, BaseEstimator):
     """Standardize features by removing the mean and scaling to unit variance
 
     The standard score of a sample `x` is calculated as:
@@ -821,7 +821,7 @@ def _more_tags(self):
         return {'allow_nan': True}
 
 
-class MaxAbsScaler(BaseEstimator, TransformerMixin):
+class MaxAbsScaler(TransformerMixin, BaseEstimator):
     """Scale each feature by its maximum absolute value.
 
     This estimator scales and translates each feature individually such
@@ -1050,7 +1050,7 @@ def maxabs_scale(X, axis=0, copy=True):
     return X
 
 
-class RobustScaler(BaseEstimator, TransformerMixin):
+class RobustScaler(TransformerMixin, BaseEstimator):
     """Scale features using statistics that are robust to outliers.
 
     This Scaler removes the median and scales the data according to
@@ -1328,7 +1328,7 @@ def robust_scale(X, axis=0, with_centering=True, with_scaling=True,
     return X
 
 
-class PolynomialFeatures(BaseEstimator, TransformerMixin):
+class PolynomialFeatures(TransformerMixin, BaseEstimator):
     """Generate polynomial and interaction features.
 
     Generate a new feature matrix consisting of all polynomial combinations
@@ -1701,7 +1701,7 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
         return X
 
 
-class Normalizer(BaseEstimator, TransformerMixin):
+class Normalizer(TransformerMixin, BaseEstimator):
     """Normalize samples individually to unit norm.
 
     Each sample (i.e. each row of the data matrix) with at least one
@@ -1839,7 +1839,7 @@ def binarize(X, threshold=0.0, copy=True):
     return X
 
 
-class Binarizer(BaseEstimator, TransformerMixin):
+class Binarizer(TransformerMixin, BaseEstimator):
     """Binarize data (set feature values to 0 or 1) according to a threshold
 
     Values greater than the threshold map to 1, while values less than
@@ -1930,7 +1930,7 @@ def _more_tags(self):
         return {'stateless': True}
 
 
-class KernelCenterer(BaseEstimator, TransformerMixin):
+class KernelCenterer(TransformerMixin, BaseEstimator):
     """Center a kernel matrix
 
     Let K(x, z) be a kernel defined by phi(x)^T phi(z), where phi is a
@@ -2091,7 +2091,7 @@ def add_dummy_feature(X, value=1.0):
         return np.hstack((np.full((n_samples, 1), value), X))
 
 
-class QuantileTransformer(BaseEstimator, TransformerMixin):
+class QuantileTransformer(TransformerMixin, BaseEstimator):
     """Transform features using quantiles information.
 
     This method transforms the features to follow a uniform or a normal
@@ -2635,7 +2635,7 @@ def quantile_transform(X, axis=0, n_quantiles=1000,
                          " axis={}".format(axis))
 
 
-class PowerTransformer(BaseEstimator, TransformerMixin):
+class PowerTransformer(TransformerMixin, BaseEstimator):
     """Apply a power transform featurewise to make data more Gaussian-like.
 
     Power transforms are a family of parametric, monotonic transformations
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f16c7588fe13c..35c8f042d2db2 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -168,7 +168,7 @@ def _encode_check_unknown(values, uniques, return_mask=False):
             return diff
 
 
-class LabelEncoder(BaseEstimator, TransformerMixin):
+class LabelEncoder(TransformerMixin, BaseEstimator):
     """Encode target labels with value between 0 and n_classes-1.
 
     This transformer should be used to encode target values, *i.e.* `y`, and
@@ -300,7 +300,7 @@ def _more_tags(self):
         return {'X_types': ['1dlabels']}
 
 
-class LabelBinarizer(BaseEstimator, TransformerMixin):
+class LabelBinarizer(TransformerMixin, BaseEstimator):
     """Binarize labels in a one-vs-all fashion
 
     Several regression and binary classification algorithms are
@@ -781,7 +781,7 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold):
         raise ValueError("{0} format is not supported".format(output_type))
 
 
-class MultiLabelBinarizer(BaseEstimator, TransformerMixin):
+class MultiLabelBinarizer(TransformerMixin, BaseEstimator):
     """Transform between iterable of iterables and a multilabel format
 
     Although a list of sets or tuples is a very intuitive format for multilabel
diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py
index 4f8c8af1283b2..97597dd330e31 100644
--- a/sklearn/random_projection.py
+++ b/sklearn/random_projection.py
@@ -289,7 +289,7 @@ def sparse_random_matrix(n_components, n_features, density='auto',
         return np.sqrt(1 / density) / np.sqrt(n_components) * components
 
 
-class BaseRandomProjection(BaseEstimator, TransformerMixin, metaclass=ABCMeta):
+class BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for random projections.
 
     Warning: This class should not be used directly.
diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py
index 704a075d95932..0cbc59e3e69d8 100644
--- a/sklearn/semi_supervised/label_propagation.py
+++ b/sklearn/semi_supervised/label_propagation.py
@@ -71,7 +71,7 @@
 from ..exceptions import ConvergenceWarning
 
 
-class BaseLabelPropagation(BaseEstimator, ClassifierMixin, metaclass=ABCMeta):
+class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for label propagation module.
 
     Parameters
diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
index b2723cc7e0c2b..7120a10d6a504 100644
--- a/sklearn/svm/base.py
+++ b/sklearn/svm/base.py
@@ -485,7 +485,7 @@ def _get_coef(self):
         return safe_sparse_dot(self._dual_coef_, self.support_vectors_)
 
 
-class BaseSVC(BaseLibSVM, ClassifierMixin, metaclass=ABCMeta):
+class BaseSVC(ClassifierMixin, BaseLibSVM, metaclass=ABCMeta):
     """ABC for LibSVM-based classifiers."""
     @abstractmethod
     def __init__(self, kernel, degree, gamma, coef0, tol, C, nu,
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index 39c7d2f334de2..a350718b3a821 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -250,7 +250,7 @@ def fit(self, X, y, sample_weight=None):
         return self
 
 
-class LinearSVR(LinearModel, RegressorMixin):
+class LinearSVR(RegressorMixin, LinearModel):
     """Linear Support Vector Regression.
 
     Similar to SVR with parameter kernel='linear', but implemented in terms of
@@ -821,7 +821,7 @@ def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale',
             random_state=random_state)
 
 
-class SVR(BaseLibSVM, RegressorMixin):
+class SVR(RegressorMixin, BaseLibSVM):
     """Epsilon-Support Vector Regression.
 
     The free parameters in the model are C and epsilon.
@@ -951,7 +951,7 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale',
             class_weight=None, max_iter=max_iter, random_state=None)
 
 
-class NuSVR(BaseLibSVM, RegressorMixin):
+class NuSVR(RegressorMixin, BaseLibSVM):
     """Nu Support Vector Regression.
 
     Similar to NuSVC, for regression, uses a parameter nu to control
@@ -1076,7 +1076,7 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3,
             verbose=verbose, max_iter=max_iter, random_state=None)
 
 
-class OneClassSVM(BaseLibSVM, OutlierMixin):
+class OneClassSVM(OutlierMixin, BaseLibSVM):
     """Unsupervised Outlier Detection.
 
     Estimate the support of a high-dimensional distribution.
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index c47925b22c92e..d83c9c99e2105 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -63,6 +63,11 @@ def _more_tags(self):
 
 
 class DiamondOverwriteTag(NaNTag, NoNaNTag):
+    def _more_tags(self):
+        return dict()
+
+
+class InheritDiamondOverwriteTag(DiamondOverwriteTag):
     pass
 
 
@@ -293,7 +298,7 @@ def test_score_sample_weight():
 
 def test_clone_pandas_dataframe():
 
-    class DummyEstimator(BaseEstimator, TransformerMixin):
+    class DummyEstimator(TransformerMixin, BaseEstimator):
         """This is a dummy class for generating numerical features
 
         This feature extractor extracts numerical features from pandas data
@@ -408,7 +413,7 @@ def __setstate__(self, state):
         self.__dict__.update(state)
 
 
-class MultiInheritanceEstimator(BaseEstimator, DontPickleAttributeMixin):
+class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator):
     def __init__(self, attribute_pickled=5):
         self.attribute_pickled = attribute_pickled
         self._attribute_not_pickled = None
@@ -475,13 +480,14 @@ def test_tag_inheritance():
     assert nan_tag_est._get_tags()['allow_nan']
     assert not no_nan_tag_est._get_tags()['allow_nan']
 
-    invalid_tags_est = OverrideTag()
-    with pytest.raises(TypeError, match="Inconsistent values for tag"):
-        invalid_tags_est._get_tags()
+    redefine_tags_est = OverrideTag()
+    assert not redefine_tags_est._get_tags()['allow_nan']
 
     diamond_tag_est = DiamondOverwriteTag()
-    with pytest.raises(TypeError, match="Inconsistent values for tag"):
-        diamond_tag_est._get_tags()
+    assert diamond_tag_est._get_tags()['allow_nan']
+
+    inherit_diamond_tag_est = InheritDiamondOverwriteTag()
+    assert inherit_diamond_tag_est._get_tags()['allow_nan']
 
 
 # XXX: Remove in 0.23
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
index a2362df70f9df..ebe541d77f055 100644
--- a/sklearn/tree/tree.py
+++ b/sklearn/tree/tree.py
@@ -73,7 +73,7 @@
 # =============================================================================
 
 
-class BaseDecisionTree(BaseEstimator, MultiOutputMixin, metaclass=ABCMeta):
+class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
     """Base class for decision trees.
 
     Warning: This class should not be used directly.
@@ -597,7 +597,7 @@ def feature_importances_(self):
 # Public estimators
 # =============================================================================
 
-class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
+class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):
     """A decision tree classifier.
 
     Read more in the :ref:`User Guide <tree>`.
@@ -972,7 +972,7 @@ def predict_log_proba(self, X):
             return proba
 
 
-class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
+class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
     """A decision tree regressor.
 
     Read more in the :ref:`User Guide <tree>`.
diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py
index 76ad144ccb171..45ac89f992a78 100644
--- a/sklearn/utils/mocking.py
+++ b/sklearn/utils/mocking.py
@@ -48,7 +48,7 @@ def __ne__(self, other):
         return not self == other
 
 
-class CheckingClassifier(BaseEstimator, ClassifierMixin):
+class CheckingClassifier(ClassifierMixin, BaseEstimator):
     """Dummy classifier to test pipelining and meta-estimators.
 
     Checks some property of X and y in fit / predict.
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
index c3930e773a46e..e26a508566871 100644
--- a/sklearn/utils/tests/test_estimator_checks.py
+++ b/sklearn/utils/tests/test_estimator_checks.py
@@ -43,7 +43,7 @@ class CorrectNotFittedError(ValueError):
     """
 
 
-class BaseBadClassifier(BaseEstimator, ClassifierMixin):
+class BaseBadClassifier(ClassifierMixin, BaseEstimator):
     def fit(self, X, y):
         return self
 
diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py
index 8f3c13b1cf844..556d57e9f8dfa 100644
--- a/sklearn/utils/tests/test_pprint.py
+++ b/sklearn/utils/tests/test_pprint.py
@@ -41,7 +41,7 @@ def fit(self, X, y):
         return self
 
 
-class StandardScaler(BaseEstimator, TransformerMixin):
+class StandardScaler(TransformerMixin, BaseEstimator):
     def __init__(self, copy=True, with_mean=True, with_std=True):
         self.with_mean = with_mean
         self.with_std = with_std