diff --git a/examples/applications/face_recognition.py b/examples/applications/plot_face_recognition.py
similarity index 100%
rename from examples/applications/face_recognition.py
rename to examples/applications/plot_face_recognition.py
diff --git a/examples/applications/topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py
similarity index 89%
rename from examples/applications/topics_extraction_with_nmf_lda.py
rename to examples/applications/plot_topics_extraction_with_nmf_lda.py
index d4ed9607073c7..e1a6f0bdbacd9 100644
--- a/examples/applications/topics_extraction_with_nmf_lda.py
+++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py
@@ -1,13 +1,14 @@
 """
-=======================================================================================
-Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation
-=======================================================================================
+========================================================
+Topic extraction with Non-negative Matrix Factorization\
+and Latent Dirichlet Allocation
+========================================================
 
-This is an example of applying :class:`sklearn.decomposition.NMF`
-and :class:`sklearn.decomposition.LatentDirichletAllocation` on a corpus of documents and
-extract additive models of the topic structure of the corpus.
-The output is a list of topics, each represented as a list of terms
-(weights are not shown).
+This is an example of applying :class:`sklearn.decomposition.NMF` and
+:class:`sklearn.decomposition.LatentDirichletAllocation` on a corpus
+of documents and extract additive models of the topic structure of the
+corpus. The output is a list of topics, each represented as a list of
+terms (weights are not shown).
 
 Non-negative Matrix Factorization is applied with two different objective
 functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
@@ -18,6 +19,7 @@
 increase the dimensions of the problem, but be aware that the time
 complexity is polynomial in NMF. In LDA, the time complexity is
 proportional to (n_samples * iterations).
+ """ # Author: Olivier Grisel diff --git a/examples/bicluster/bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py similarity index 94% rename from examples/bicluster/bicluster_newsgroups.py rename to examples/bicluster/plot_bicluster_newsgroups.py index 8102f2293e604..0c7104c8f35ad 100644 --- a/examples/bicluster/bicluster_newsgroups.py +++ b/examples/bicluster/plot_bicluster_newsgroups.py @@ -33,23 +33,30 @@ ---------------- bicluster 0 : 1951 documents, 4373 words categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med - words : gun, guns, geb, banks, firearms, drugs, gordon, clinton, cdt, amendment + words : gun, guns, geb, banks, firearms, drugs, gordon, clinton, + cdt, amendment bicluster 1 : 1165 documents, 3304 words - categories : 29% talk.politics.mideast, 26% soc.religion.christian, 25% alt.atheism - words : god, jesus, christians, atheists, kent, sin, morality, belief, resurrection, marriage + categories : 29% talk.politics.mideast, 26% soc.religion.christian, + 25% alt.atheism + words : god, jesus, christians, atheists, kent, sin, morality, + belief, resurrection, marriage bicluster 2 : 2219 documents, 2830 words - categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware, 16% comp.graphics - words : voltage, dsp, board, receiver, circuit, shipping, packages, stereo, compression, package + categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware, + 16% comp.graphics + words : voltage, dsp, board, receiver, circuit, shipping, packages, + stereo, compression, package bicluster 3 : 1860 documents, 2745 words categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale - words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw, bikes + words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw, + bikes bicluster 4 : 12 documents, 155 words categories : 100% rec.sport.hockey - words : scorer, unassisted, reichel, semak, sweeney, kovalenko, ricci, audette, momesso, nedved + words : scorer, unassisted, reichel, semak, sweeney, kovalenko, + ricci, audette, momesso, nedved """ from __future__ import print_function @@ -132,8 +139,8 @@ def bicluster_ncut(i): return sys.float_info.max row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0] col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0] - # Note: the following is identical to X[rows[:, np.newaxis], cols].sum() but - # much faster in scipy <= 0.16 + # Note: the following is identical to X[rows[:, np.newaxis], + # cols].sum() but much faster in scipy <= 0.16 weight = X[rows][:, cols].sum() cut = (X[row_complement][:, cols].sum() + X[rows][:, col_complement].sum()) diff --git a/examples/exercises/digits_classification_exercise.py b/examples/exercises/plot_digits_classification_exercise.py similarity index 82% rename from examples/exercises/digits_classification_exercise.py rename to examples/exercises/plot_digits_classification_exercise.py index a1f0b84fd1fd2..25ab7e71c5925 100644 --- a/examples/exercises/digits_classification_exercise.py +++ b/examples/exercises/plot_digits_classification_exercise.py @@ -20,10 +20,10 @@ n_samples = len(X_digits) -X_train = X_digits[:.9 * n_samples] -y_train = y_digits[:.9 * n_samples] -X_test = X_digits[.9 * n_samples:] -y_test = y_digits[.9 * n_samples:] +X_train = X_digits[:int(.9 * n_samples)] +y_train = y_digits[:int(.9 * n_samples)] +X_test = X_digits[int(.9 * n_samples):] +y_test = y_digits[int(.9 * n_samples):] knn = neighbors.KNeighborsClassifier() logistic = 
diff --git a/examples/feature_selection/feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py
similarity index 100%
rename from examples/feature_selection/feature_selection_pipeline.py
rename to examples/feature_selection/plot_feature_selection_pipeline.py
diff --git a/examples/linear_model/lasso_dense_vs_sparse_data.py b/examples/linear_model/plot_lasso_dense_vs_sparse_data.py
similarity index 100%
rename from examples/linear_model/lasso_dense_vs_sparse_data.py
rename to examples/linear_model/plot_lasso_dense_vs_sparse_data.py
diff --git a/examples/model_selection/grid_search_digits.py b/examples/model_selection/plot_grid_search_digits.py
similarity index 100%
rename from examples/model_selection/grid_search_digits.py
rename to examples/model_selection/plot_grid_search_digits.py
diff --git a/examples/model_selection/randomized_search.py b/examples/model_selection/plot_randomized_search.py
similarity index 100%
rename from examples/model_selection/randomized_search.py
rename to examples/model_selection/plot_randomized_search.py
diff --git a/examples/feature_stacker.py b/examples/plot_feature_stacker.py
similarity index 100%
rename from examples/feature_stacker.py
rename to examples/plot_feature_stacker.py
diff --git a/examples/missing_values.py b/examples/plot_missing_values.py
similarity index 92%
rename from examples/missing_values.py
rename to examples/plot_missing_values.py
index 8a0895f9a589f..17a256fa4fa2f 100644
--- a/examples/missing_values.py
+++ b/examples/plot_missing_values.py
@@ -3,10 +3,11 @@
 Imputing missing values before building an estimator
 ======================================================
 
-This example shows that imputing the missing values can give better results
-than discarding the samples containing any missing value.
-Imputing does not always improve the predictions, so please check via cross-validation.
-Sometimes dropping rows or using marker values is more effective.
+This example shows that imputing the missing values can give better
+results than discarding the samples containing any missing value.
+Imputing does not always improve the predictions, so please check via
+cross-validation. Sometimes dropping rows or using marker values is
+more effective.
 
 Missing values can be replaced by the mean, the median or the most
 frequent value using the ``strategy`` hyper-parameter.
@@ -20,7 +21,7 @@
 Score after imputation of the missing values = 0.55
 
 In this case, imputing helps the classifier get close to the original score.
- 
+
 """
 
 import numpy as np
@@ -44,7 +45,7 @@
 
 # Add missing values in 75% of the lines
 missing_rate = 0.75
-n_missing_samples = np.floor(n_samples * missing_rate)
+n_missing_samples = int(np.floor(n_samples * missing_rate))
 missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                       dtype=np.bool),
                              np.ones(n_missing_samples,
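
Note (not part of the patch): several hunks above wrap float-valued expressions in int() before using them as slice bounds or array sizes, e.g. X_digits[:int(.9 * n_samples)] and int(np.floor(n_samples * missing_rate)), since recent Python/NumPy versions no longer accept floats in those positions. A minimal standalone sketch of the pattern, using hypothetical placeholder arrays rather than the datasets loaded by the examples:

    import numpy as np

    # Placeholder data standing in for X_digits / y_digits in the exercise.
    X = np.arange(200).reshape(100, 2)
    y = np.arange(100)

    n_samples = len(X)
    # .9 * n_samples is a float; slice bounds must be integers, hence the
    # explicit int() cast mirrored from the patch above.
    split = int(.9 * n_samples)

    X_train, y_train = X[:split], y[:split]
    X_test, y_test = X[split:], y[split:]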