From d7d92882e927db2521bc850e190f0070164032eb Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 7 Feb 2020 21:32:54 -0500 Subject: [PATCH 01/35] Add sklearn to docs requirements --- doc/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/requirements.txt b/doc/requirements.txt index 51c56a393a1..8b46ed96701 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -17,6 +17,7 @@ requests networkx squarify scikit-image +scikit-learn sphinx sphinx_bootstrap_theme recommonmark From 612c0f676cc19ad4c66265296f8a12f08884ca1a Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 7 Feb 2020 21:34:44 -0500 Subject: [PATCH 02/35] Create kNN docs draft --- doc/python/ml-knn.md | 119 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 doc/python/ml-knn.md diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md new file mode 100644 index 00000000000..78e04479db1 --- /dev/null +++ b/doc/python/ml-knn.md @@ -0,0 +1,119 @@ +## K-Nearest Neighbors (kNN) + +How to visualize the K-Nearest Neighbors (kNN) algorithm using scikit-learn. 
+ + +### Binary Probability Estimates with `go.Contour` + +```python +import numpy as np +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier +import plotly.express as px +import plotly.graph_objects as go + +X, y = make_moons(noise=0.3, random_state=0) + +# Create a mesh grid on which we will run our model +x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin +y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin +xrange = np.arange(x_min, x_max, mesh_size) +yrange = np.arange(y_min, y_max, mesh_size) +xx, yy = np.meshgrid(xrange, yrange) + +# Create classifier, run predictions on grid +clf = neighbors.KNeighborsClassifier(15, weights='uniform') +clf.fit(X, y) +Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] +Z = Z.reshape(xx.shape) + +fig = px.scatter(X, x=0, y=1, color=y.astype(str)) +fig.add_trace( + go.Contour( + x=xrange, + y=yrange, + z=Z, + showscale=False, + colorscale=['Blue', 'Red'], + opacity=0.4 + ) +) +``` + +### Multi-class classification with `px.data` and `go.Heatmap` + +```python +import numpy as np +from sklearn.neighbors import KNeighborsClassifier +import plotly.express as px +import plotly.graph_objects as go + +mesh_size = .02 +margin = 1 + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width']] +y = df.species_id + +# Create a mesh grid on which we will run our model +l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin +w_min, w_max = df.sepal_width.min() - margin, df.sepal_width.max() + margin +lrange = np.arange(l_min, l_max, mesh_size) +wrange = np.arange(w_min, w_max, mesh_size) +ll, ww = np.meshgrid(lrange, wrange) + +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X, y) +Z = clf.predict(np.c_[ll.ravel(), ww.ravel()]) +Z = Z.reshape(ll.shape) + +fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') +fig.update_traces(marker_size=10, marker_line_width=1) 
+fig.add_trace( + go.Heatmap( + x=lrange, + y=wrange, + z=Z, + showscale=False, + colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']], + opacity=0.25 + ) +) +``` + +### Visualizing kNN Regression + +```python +from sklearn.neighbors import KNeighborsRegressor +import plotly.express as px +import plotly.graph_objects as go + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) + +knn_dist = KNeighborsRegressor(10, weights='distance') +knn_uni = KNeighborsRegressor(10, weights='uniform') +knn_dist.fit(X, df.tip) +knn_uni.fit(X, df.tip) + +x_range = np.linspace(X.min(), X.max(), 100) +y_dist = knn_dist.predict(x_range.reshape(-1, 1)) +y_uni = knn_uni.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) +fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +``` + +### Reference + +Learn more about `px`, `go.Contour`, and `go.Heatmap` here: +* https://plot.ly/python/plotly-express/ +* https://plot.ly/python/heatmaps/ +* https://plot.ly/python/contour-plots/ + +This tutorial was inspired by amazing examples from the official scikit-learn docs: +* https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html +* https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html +* https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html From 6b3bbb1ebb8f7312cffe58a13c3af72f1df26b82 Mon Sep 17 00:00:00 2001 From: Xing Han Date: Sat, 22 Feb 2020 15:52:39 -0500 Subject: [PATCH 03/35] Update based on Emma's suggestions --- doc/python/ml-knn.md | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 78e04479db1..031097a4404 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -7,10 +7,13 @@ How to visualize the K-Nearest Neighbors (kNN) algorithm using 
scikit-learn. ```python import numpy as np -from sklearn.datasets import make_moons -from sklearn.neighbors import KNeighborsClassifier import plotly.express as px import plotly.graph_objects as go +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier + +mesh_size = .02 +margin = 1 X, y = make_moons(noise=0.3, random_state=0) @@ -22,12 +25,12 @@ yrange = np.arange(y_min, y_max, mesh_size) xx, yy = np.meshgrid(xrange, yrange) # Create classifier, run predictions on grid -clf = neighbors.KNeighborsClassifier(15, weights='uniform') +clf = KNeighborsClassifier(15, weights='uniform') clf.fit(X, y) Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] Z = Z.reshape(xx.shape) -fig = px.scatter(X, x=0, y=1, color=y.astype(str)) +fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''}) fig.add_trace( go.Contour( x=xrange, @@ -38,15 +41,16 @@ fig.add_trace( opacity=0.4 ) ) +fig.show() ``` ### Multi-class classification with `px.data` and `go.Heatmap` ```python import numpy as np -from sklearn.neighbors import KNeighborsClassifier import plotly.express as px import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsClassifier mesh_size = .02 margin = 1 @@ -67,6 +71,8 @@ clf = KNeighborsClassifier(15, weights='distance') clf.fit(X, y) Z = clf.predict(np.c_[ll.ravel(), ww.ravel()]) Z = Z.reshape(ll.shape) +proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()]) +proba = proba.reshape(ll.shape + (3,)) fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') fig.update_traces(marker_size=10, marker_line_width=1) @@ -77,17 +83,27 @@ fig.add_trace( z=Z, showscale=False, colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']], - opacity=0.25 + opacity=0.25, + customdata=proba, + hovertemplate=( + 'sepal length: %{x}
' + 'sepal width: %{y}
' + 'p(setosa): %{customdata[0]:.3f}
' + 'p(versicolor): %{customdata[1]:.3f}
' + 'p(virginica): %{customdata[2]:.3f}' + ) ) ) +fig.show() ``` ### Visualizing kNN Regression ```python -from sklearn.neighbors import KNeighborsRegressor +import numpy as np import plotly.express as px import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor df = px.data.tips() X = df.total_bill.values.reshape(-1, 1) @@ -104,6 +120,7 @@ y_uni = knn_uni.predict(x_range.reshape(-1, 1)) fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +fig.show() ``` ### Reference From fbdd889de59efd6ce682906c435986dd1fda29aa Mon Sep 17 00:00:00 2001 From: Xing Han Date: Sat, 22 Feb 2020 16:44:08 -0500 Subject: [PATCH 04/35] Add a header --- doc/python/ml-knn.md | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 031097a4404..7e265ee8485 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -1,6 +1,43 @@ -## K-Nearest Neighbors (kNN) - -How to visualize the K-Nearest Neighbors (kNN) algorithm using scikit-learn. +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.6.10 + plotly: + description: How to visualize k-Nearest Neighbors (kNN) created using scikit-learn + in Python with Plotly. 
+ display_as: basic + language: python + layout: base + name: k-Nearest Neighbors + order: 1 + page_type: example_index + permalink: python/knn/ + redirect_from: python/machine-learning-tutorials/ + thumbnail: thumbnail/line-and-scatter.jpg +--- + +## K-Nearest Neighbors (kNN) Classification + +How to visualize K-Nearest Neighbors (kNN) classification using scikit-learn. ### Binary Probability Estimates with `go.Contour` From b1d7fefce7ee603a1a4d3b14469633f89b975570 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Sun, 23 Feb 2020 01:26:10 -0500 Subject: [PATCH 05/35] Placeholder Regression Section --- doc/python/ml-regression.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 doc/python/ml-regression.md diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md new file mode 100644 index 00000000000..e2b0d37724d --- /dev/null +++ b/doc/python/ml-regression.md @@ -0,0 +1,36 @@ +# Regression + + +### Visualizing kNN Regression + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) + +knn_dist = KNeighborsRegressor(10, weights='distance') +knn_uni = KNeighborsRegressor(10, weights='uniform') +knn_dist.fit(X, df.tip) +knn_uni.fit(X, df.tip) + +x_range = np.linspace(X.min(), X.max(), 100) +y_dist = knn_dist.predict(x_range.reshape(-1, 1)) +y_uni = knn_uni.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) +fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +fig.show() +``` + +### Reference + +Learn more about `px` here: +* https://plot.ly/python/plotly-express/ + +This tutorial was inspired by amazing examples from the official scikit-learn docs: +* 
https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html From eafaf2880f9a0c81762493a4384a1a12a9896b5b Mon Sep 17 00:00:00 2001 From: xhlulu Date: Sun, 23 Feb 2020 01:26:27 -0500 Subject: [PATCH 06/35] Create 2 basic sections, 2 advanced sections --- doc/python/ml-knn.md | 124 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 22 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 7e265ee8485..27ef20c7388 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -1,6 +1,7 @@ --- jupyter: jupytext: + formats: ipynb,md notebook_metadata_filter: all text_representation: extension: .md @@ -20,14 +21,14 @@ jupyter: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.6.10 + version: 3.7.6 plotly: description: How to visualize k-Nearest Neighbors (kNN) created using scikit-learn in Python with Plotly. display_as: basic language: python layout: base - name: k-Nearest Neighbors + name: K-Nearest Neighbors (kNN) Classification order: 1 page_type: example_index permalink: python/knn/ @@ -35,12 +36,49 @@ jupyter: thumbnail: thumbnail/line-and-scatter.jpg --- -## K-Nearest Neighbors (kNN) Classification +## Basic Binary Classification with `plotly.express` -How to visualize K-Nearest Neighbors (kNN) classification using scikit-learn. 
+```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_moons +from sklearn.neighbors import KNeighborsClassifier + +X, y = make_moons(noise=0.3, random_state=0) +X_test, _ = make_moons(noise=0.3, random_state=1) + +clf = KNeighborsClassifier(15) +clf.fit(X, y.astype(str)) # Fit on training set +y_pred = clf.predict(X_test) # Predict on new data + +fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'}) +fig.update_traces(marker_size=10) +fig.show() +``` +## Visualize Binary Prediction Scores + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_classification +from sklearn.neighbors import KNeighborsClassifier + +X, y = make_classification(n_features=2, n_redundant=0, random_state=0) +X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1) + +clf = KNeighborsClassifier(15) +clf.fit(X, y) # Fit on training set +y_score = clf.predict_proba(X_test)[:, 1] # Predict on new data + +fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'}) +fig.update_traces(marker_size=10) +fig.show() +``` -### Binary Probability Estimates with `go.Contour` +## Probability Estimates with `go.Contour` ```python import numpy as np @@ -68,6 +106,7 @@ Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] Z = Z.reshape(xx.shape) fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''}) +fig.update_traces(marker_size=10, marker_line_width=1) fig.add_trace( go.Contour( x=xrange, @@ -75,13 +114,14 @@ fig.add_trace( z=Z, showscale=False, colorscale=['Blue', 'Red'], - opacity=0.4 + opacity=0.4, + name='Confidence' ) ) fig.show() ``` -### Multi-class classification with `px.data` and `go.Heatmap` +## Multi-class prediction confidence with `go.Heatmap` ```python import numpy as np @@ -92,6 +132,7 @@ from sklearn.neighbors import 
KNeighborsClassifier mesh_size = .02 margin = 1 +# We will use the iris data, which is included in px df = px.data.iris() X = df[['sepal_length', 'sepal_width']] y = df.species_id @@ -134,29 +175,66 @@ fig.add_trace( fig.show() ``` -### Visualizing kNN Regression +## 3D Classification with `px.scatter_3d` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import train_test_split + +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width"] + +X = df[features] +y = df.species +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) + +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X_train, y_train) +y_pred = clf.predict(X_test) +y_score = clf.predict_proba(X_test) +y_score = np.around(y_score.max(axis=1), 4) + +fig = px.scatter_3d( + X_test, + x='sepal_length', + y='sepal_width', + z='petal_width', + symbol=y_pred, + color=y_score, + labels={'symbol': 'prediction', 'color': 'score'} +) +fig.update_layout(legend=dict(x=0, y=0)) +fig.show() +``` + +## High Dimension Visualization with `px.scatter_matrix` + +If you need to visualize classifications that go beyond 3D, you can use the [scatter plot matrix](https://plot.ly/python/splom/). 
```python import numpy as np import plotly.express as px import plotly.graph_objects as go -from sklearn.neighbors import KNeighborsRegressor +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import train_test_split -df = px.data.tips() -X = df.total_bill.values.reshape(-1, 1) +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] -knn_dist = KNeighborsRegressor(10, weights='distance') -knn_uni = KNeighborsRegressor(10, weights='uniform') -knn_dist.fit(X, df.tip) -knn_uni.fit(X, df.tip) +X = df[features] +y = df.species +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) -x_range = np.linspace(X.min(), X.max(), 100) -y_dist = knn_dist.predict(x_range.reshape(-1, 1)) -y_uni = knn_uni.predict(x_range.reshape(-1, 1)) +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='distance') +clf.fit(X_train, y_train) +y_pred = clf.predict(X_test) -fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) -fig.add_traces(go.Scatter(x=x_range, y=y_uni, name='Weights: Uniform')) -fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) +fig = px.scatter_matrix(X_test, dimensions=features, color=y_pred, labels={'color': 'prediction'}) fig.show() ``` @@ -166,8 +244,10 @@ Learn more about `px`, `go.Contour`, and `go.Heatmap` here: * https://plot.ly/python/plotly-express/ * https://plot.ly/python/heatmaps/ * https://plot.ly/python/contour-plots/ +* https://plot.ly/python/3d-scatter-plots/ +* https://plot.ly/python/splom/ This tutorial was inspired by amazing examples from the official scikit-learn docs: -* https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html * https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html * https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html +* 
https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html From 08aa89b04ed846741d5914444778bb378ff6db6f Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 28 Feb 2020 12:40:16 -0500 Subject: [PATCH 07/35] KNN ML docs: Update thumbnail, name, permalink, description, display_as --- doc/python/ml-knn.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 27ef20c7388..2bcab469875 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -1,7 +1,6 @@ --- jupyter: jupytext: - formats: ipynb,md notebook_metadata_filter: all text_representation: extension: .md @@ -23,17 +22,16 @@ jupyter: pygments_lexer: ipython3 version: 3.7.6 plotly: - description: How to visualize k-Nearest Neighbors (kNN) created using scikit-learn - in Python with Plotly. - display_as: basic + description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification + with Plotly + display_as: ai_ml language: python layout: base - name: K-Nearest Neighbors (kNN) Classification + name: kNN Classification order: 1 page_type: example_index - permalink: python/knn/ - redirect_from: python/machine-learning-tutorials/ - thumbnail: thumbnail/line-and-scatter.jpg + permalink: python/knn-classification/ + thumbnail: thumbnail/knn-classification.png --- ## Basic Binary Classification with `plotly.express` @@ -152,7 +150,7 @@ Z = Z.reshape(ll.shape) proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()]) proba = proba.reshape(ll.shape + (3,)) -fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') +fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species', width=1000, height=1000) fig.update_traces(marker_size=10, marker_line_width=1) fig.add_trace( go.Heatmap( From be71cfe5d79c99a3fc99e721d771140231946874 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 28 Feb 2020 16:55:42 -0500 Subject: [PATCH 08/35] Added 3 sections, drafted out 2 sections --- 
doc/python/ml-regression.md | 157 +++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 2 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index e2b0d37724d..3e34f73de3f 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -1,7 +1,91 @@ -# Regression +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize regression in scikit-learn with Plotly + display_as: ai_ml + language: python + layout: base + name: ML Regression + order: 2 + page_type: example_index + permalink: python/ml-regression/ + thumbnail: thumbnail/knn-classification.png +--- +## Basic linear regression -### Visualizing kNN Regression +This example shows how to train a simple linear regression from `sklearn` to predicts the tips servers will receive based on the value of the total bill (dataset is included in `px.data`). 
+ +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) + +model = LinearRegression() +model.fit(X, df.tip) + +x_range = np.linspace(X.min(), X.max(), 100) +y_range = model.predict(x_range.reshape(-1, 1)) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit')) +fig.show() +``` + +## Model generalization on unseen data + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +X_train, X_test, y_train, y_test = train_test_split(X, df.tip, random_state=0) + +model = LinearRegression() +model.fit(X_train, y_train) + +x_range = np.linspace(X.min(), X.max(), 100) +y_range = model.predict(x_range.reshape(-1, 1)) + + +fig = go.Figure([ + go.Scatter(x=X_train.squeeze(), y=y_train, name='train', mode='markers'), + go.Scatter(x=X_test.squeeze(), y=y_test, name='test', mode='markers'), + go.Scatter(x=x_range, y=y_range, name='prediction') +]) +fig.show() +``` + +## Comparing different kNN models parameters ```python import numpy as np @@ -27,6 +111,75 @@ fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) fig.show() ``` +## 3D regression surface with `px.scatter_3d` and `go.Surface` + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.neighbors import KNeighborsRegressor + +mesh_size = .02 +margin = 0 + +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width"] + +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] + +# Condition the model on sepal width and length, predict the petal width +knn = KNeighborsRegressor(10, 
weights='distance') +knn.fit(X, y) + +# Create a mesh grid on which we will run our model +x_min, x_max = X.sepal_width.min() - margin, X.sepal_width.max() + margin +y_min, y_max = X.sepal_length.min() - margin, X.sepal_length.max() + margin +xrange = np.arange(x_min, x_max, mesh_size) +yrange = np.arange(y_min, y_max, mesh_size) +xx, yy = np.meshgrid(xrange, yrange) + +# Run kNN +pred = knn.predict(np.c_[xx.ravel(), yy.ravel()]) +pred = pred.reshape(xx.shape) + +# Generate the plot +fig = px.scatter_3d(df, x='sepal_width', y='sepal_length', z='petal_width') +fig.update_traces(marker=dict(size=5)) +fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface')) +fig.show() +``` + +## Label polynomial fits with latex + +```python + +``` + +## Prediction Error Plots + + +### Simple Prediction Error + +```python + +``` + +### Augmented Prediction Error plot using `px` + +```python + +``` + +### Grid Search Visualization using `px.scatter_matrix` + + +## Residual Plots + +```python + +``` + ### Reference Learn more about `px` here: From 61b3ad8e4d73ff6dd4d529c2499e7621719fec7d Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 15:39:38 -0500 Subject: [PATCH 09/35] ML Docs: Added 3 new sections to regression notebook --- doc/python/ml-regression.md | 206 ++++++++++++++++++++++++++++++++++-- 1 file changed, 199 insertions(+), 7 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 3e34f73de3f..2e0087982bd 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -33,9 +33,28 @@ jupyter: thumbnail: thumbnail/knn-classification.png --- -## Basic linear regression +## Basic linear regression plots -This example shows how to train a simple linear regression from `sklearn` to predicts the tips servers will receive based on the value of the total bill (dataset is included in `px.data`). 
+ +### Ordinary Least Square (OLS) with `plotly.express` + + +This example shows how to use `plotly.express` to train a simple Ordinary Least Square (OLS) that can predict the tips servers will receive based on the value of the total bill. + +```python +import plotly.express as px + +df = px.data.tips() +fig = px.scatter( + df, x='total_bill', y='tip', opacity=0.65, + trendline='ols', trendline_color_override='red' +) +fig.show() +``` + +### Linear Regression with scikit-learn + +You can also perform the same prediction using scikit-learn's `LinearRegression`. ```python import numpy as np @@ -123,7 +142,6 @@ mesh_size = .02 margin = 0 df = px.data.iris() -features = ["sepal_width", "sepal_length", "petal_width"] X = df[['sepal_width', 'sepal_length']] y = df['petal_width'] @@ -150,10 +168,46 @@ fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface')) fig.show() ``` -## Label polynomial fits with latex +## Displaying `PolynomialFeatures` using $\LaTeX$ + +It's easy to display latex equations in legend and titles by simply adding `$` before and after your equation. 
```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures + +def format_coefs(coefs): + equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)] + equation = "$" + " + ".join(equation_list) + "$" + + replace_map = {"x^0": "", "x^1": "x", '+ -': '- '} + for old, new in replace_map.items(): + equation = equation.replace(old, new) + + return equation +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +for n_features in [1, 2, 3, 4]: + poly = PolynomialFeatures(n_features) + poly.fit(X) + X_poly = poly.transform(X) + x_range_poly = poly.transform(x_range) + + model = LinearRegression(fit_intercept=False) + model.fit(X_poly, df.tip) + y_poly = model.predict(x_range_poly) + + equation = format_coefs(model.coef_.round(2)) + fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) + +fig.show() ``` ## Prediction Error Plots @@ -162,22 +216,160 @@ fig.show() ### Simple Prediction Error ```python +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +df = px.data.iris() +X = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X, y) +y_pred = model.predict(X) + +fig = px.scatter(x=y, y=y_pred, labels={'x': 'y true', 'y': 'y pred'}) +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=y.min(), y0=y.min(), + x1=y.max(), y1=y.max() +) +fig.show() ``` -### Augmented Prediction Error plot using `px` +### Augmented Prediction Error analysis using `plotly.express` ```python +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import 
LinearRegression +from sklearn.model_selection import train_test_split -``` +df = px.data.iris() -### Grid Search Visualization using `px.scatter_matrix` +# Split data into training and test splits +train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0) +df['split'] = 'train' +df.loc[test_idx, 'split'] = 'test' +X = df[['sepal_width', 'sepal_length']] +X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y_train = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X_train, y_train) +df['prediction'] = model.predict(X) + +fig = px.scatter( + df, x='petal_width', y='prediction', + marginal_x='histogram', marginal_y='histogram', + color='split', trendline='ols' +) +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=y.min(), y0=y.min(), + x1=y.max(), y1=y.max() +) + +fig.show() +``` ## Residual Plots +Just like prediction error plots, it's easy to visualize your prediction residuals in just a few lines of codes using `plotly.express` built-in capabilities. 
+ +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split + +df = px.data.iris() + +# Split data into training and test splits +train_idx, test_idx = train_test_split(df.index, test_size=.25, random_state=0) +df['split'] = 'train' +df.loc[test_idx, 'split'] = 'test' + +X = df[['sepal_width', 'sepal_length']] +X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y_train = df.loc[train_idx, 'petal_width'] + +# Condition the model on sepal width and length, predict the petal width +model = LinearRegression() +model.fit(X_train, y_train) +df['prediction'] = model.predict(X) +df['residual'] = df['prediction'] - df['petal_width'] + +fig = px.scatter( + df, x='prediction', y='residual', + marginal_y='violin', + color='split', trendline='ols' +) +fig.show() +``` + +## Grid Search Visualization using `px` facets + ```python +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from sklearn.model_selection import GridSearchCV +from sklearn.tree import DecisionTreeRegressor +N_FOLD = 5 + +df = px.data.iris() +X = df.loc[train_idx, ['sepal_width', 'sepal_length']] +y = df.loc[train_idx, 'petal_width'] + +model = DecisionTreeRegressor() +param_grid = { + 'criterion': ['mse', 'friedman_mse', 'mae'], + 'max_depth': range(2, 5) +} +grid = GridSearchCV(model, param_grid, cv=N_FOLD) + +grid.fit(X, y) +grid_df = pd.DataFrame(grid.cv_results_) + +# Convert the wide format of the grid into the long format +# accepted by plotly.express +melted = ( + grid_df + .rename(columns=lambda col: col.replace('param_', '')) + .melt( + value_vars=[f'split{i}_test_score' for i in range(N_FOLD)], + id_vars=['rank_test_score', 'mean_test_score', + 'mean_fit_time', 'criterion', 'max_depth'] + ) +) + +# Convert R-Squared measure to % +melted[['value', 'mean_test_score']] *= 100 + +# Format the variable names for 
simplicity +melted['variable'] = ( + melted['variable'] + .str.replace('_test_score', '') + .str.replace('split', '') +) + +px.bar( + melted, x='variable', y='value', + color='mean_test_score', + facet_row='max_depth', + facet_col='criterion', + title='Test Scores of Grid Search', + hover_data=['mean_fit_time', 'rank_test_score'], + labels={'variable': 'cv_split', + 'value': 'r_squared', + 'mean_test_score': "mean_r_squared"} +) ``` ### Reference From 86e987b380a3c8951ee28ed7dd33b26db307366e Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 16:48:38 -0500 Subject: [PATCH 10/35] ML Docs: Updated last ML regression section for clarity --- doc/python/ml-regression.md | 70 +++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 2e0087982bd..3c9a2326188 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -213,7 +213,7 @@ fig.show() ## Prediction Error Plots -### Simple Prediction Error +### Simple actual vs predicted plot ```python import plotly.express as px @@ -221,8 +221,8 @@ import plotly.graph_objects as go from sklearn.linear_model import LinearRegression df = px.data.iris() -X = df.loc[train_idx, ['sepal_width', 'sepal_length']] -y = df.loc[train_idx, 'petal_width'] +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] # Condition the model on sepal width and length, predict the petal width model = LinearRegression() @@ -238,7 +238,7 @@ fig.add_shape( fig.show() ``` -### Augmented Prediction Error analysis using `plotly.express` +### Augmented prediction error analysis using `plotly.express` ```python import plotly.express as px @@ -276,7 +276,7 @@ fig.add_shape( fig.show() ``` -## Residual Plots +## Residual plots Just like prediction error plots, it's easy to visualize your prediction residuals in just a few lines of codes using `plotly.express` built-in capabilities. 
@@ -312,28 +312,34 @@ fig = px.scatter( fig.show() ``` -## Grid Search Visualization using `px` facets +## Grid search visualization using `px.density_heatmap` and `px.box` + +In this example, we show how to visualize the results of a grid search on a `DecisionTreeRegressor`. The first plot shows how to visualize the score of each model parameter on individual splits (grouped using facets). The second plot aggregates the results of all splits such that each box represents a single model. ```python +import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go from sklearn.model_selection import GridSearchCV from sklearn.tree import DecisionTreeRegressor -N_FOLD = 5 +N_FOLD = 6 +# Load and shuffle dataframe df = px.data.iris() -X = df.loc[train_idx, ['sepal_width', 'sepal_length']] -y = df.loc[train_idx, 'petal_width'] +df = df.sample(frac=1, random_state=0) + +X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] +# Define and fit the grid model = DecisionTreeRegressor() param_grid = { 'criterion': ['mse', 'friedman_mse', 'mae'], 'max_depth': range(2, 5) } grid = GridSearchCV(model, param_grid, cv=N_FOLD) - grid.fit(X, y) grid_df = pd.DataFrame(grid.cv_results_) @@ -344,32 +350,42 @@ melted = ( .rename(columns=lambda col: col.replace('param_', '')) .melt( value_vars=[f'split{i}_test_score' for i in range(N_FOLD)], - id_vars=['rank_test_score', 'mean_test_score', - 'mean_fit_time', 'criterion', 'max_depth'] + id_vars=['mean_test_score', 'mean_fit_time', 'criterion', 'max_depth'], + var_name="cv_split", + value_name="r_squared" ) ) -# Convert R-Squared measure to % -melted[['value', 'mean_test_score']] *= 100 - # Format the variable names for simplicity -melted['variable'] = ( - melted['variable'] +melted['cv_split'] = ( + melted['cv_split'] .str.replace('_test_score', '') .str.replace('split', '') ) -px.bar( - melted, x='variable', y='value', - color='mean_test_score', - facet_row='max_depth', - facet_col='criterion', 
- title='Test Scores of Grid Search', - hover_data=['mean_fit_time', 'rank_test_score'], - labels={'variable': 'cv_split', - 'value': 'r_squared', - 'mean_test_score': "mean_r_squared"} +# Single function call to plot each figure +fig_hmap = px.density_heatmap( + melted, x="max_depth", y='criterion', + histfunc="sum", z="r_squared", + title='Grid search results on individual fold', + hover_data=['mean_fit_time'], + facet_col="cv_split", facet_col_wrap=3, + labels={'mean_test_score': "mean_r_squared"} ) + +fig_box = px.box( + melted, x='max_depth', y='r_squared', + title='Grid search results ', + hover_data=['mean_fit_time'], + points='all', + color="criterion", + hover_name='cv_split', + labels={'mean_test_score': "mean_r_squared"} +) + +# Display +fig_hmap.show() +fig_box.show() ``` ### Reference From 1e4a00805aeaa6b3dfcfa61312a9e8783edb072f Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 17:14:44 -0500 Subject: [PATCH 11/35] ML Docs: Added annotations after each section of regression notebook --- doc/python/ml-regression.md | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 3c9a2326188..6414dbf43a9 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -78,6 +78,8 @@ fig.show() ## Model generalization on unseen data +Easily color your plot based on a predefined data split. + ```python import numpy as np import plotly.express as px @@ -106,6 +108,8 @@ fig.show() ## Comparing different kNN models parameters +Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`. 
+ ```python import numpy as np import plotly.express as px @@ -114,14 +118,16 @@ from sklearn.neighbors import KNeighborsRegressor df = px.data.tips() X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100) +# Model #1 knn_dist = KNeighborsRegressor(10, weights='distance') -knn_uni = KNeighborsRegressor(10, weights='uniform') knn_dist.fit(X, df.tip) -knn_uni.fit(X, df.tip) - -x_range = np.linspace(X.min(), X.max(), 100) y_dist = knn_dist.predict(x_range.reshape(-1, 1)) + +# Model #2 +knn_uni = KNeighborsRegressor(10, weights='uniform') +knn_uni.fit(X, df.tip) y_uni = knn_uni.predict(x_range.reshape(-1, 1)) fig = px.scatter(df, x='total_bill', y='tip', color='sex', opacity=0.65) @@ -132,6 +138,8 @@ fig.show() ## 3D regression surface with `px.scatter_3d` and `go.Surface` +Visualize the decision plane of your model whenever you have more than one variable in your `X`. + ```python import numpy as np import plotly.express as px @@ -229,7 +237,7 @@ model = LinearRegression() model.fit(X, y) y_pred = model.predict(X) -fig = px.scatter(x=y, y=y_pred, labels={'x': 'y true', 'y': 'y pred'}) +fig = px.scatter(x=y_pred, y=y, labels={'x': 'prediction', 'y': 'actual'}) fig.add_shape( type="line", line=dict(dash='dash'), x0=y.min(), y0=y.min(), @@ -238,7 +246,9 @@ fig.add_shape( fig.show() ``` -### Augmented prediction error analysis using `plotly.express` +### Enhanced prediction error analysis using `plotly.express` + +Add marginal histograms to quickly diagnose any prediction bias your model might have. The built-in `OLS` functionality lets you visualize how well your model generalizes by comparing it with the theoretical optimal fit (black dotted line).
```python import plotly.express as px @@ -254,6 +264,7 @@ df['split'] = 'train' df.loc[test_idx, 'split'] = 'test' X = df[['sepal_width', 'sepal_length']] +y = df['petal_width'] X_train = df.loc[train_idx, ['sepal_width', 'sepal_length']] y_train = df.loc[train_idx, 'petal_width'] @@ -263,7 +274,7 @@ model.fit(X_train, y_train) df['prediction'] = model.predict(X) fig = px.scatter( - df, x='petal_width', y='prediction', + df, x='prediction', y='petal_width', marginal_x='histogram', marginal_y='histogram', color='split', trendline='ols' ) From 1de7a14986e4e53fc29307caf9d577c08ac49ac2 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 17:31:59 -0500 Subject: [PATCH 12/35] ML Docs: updated ml regression header --- doc/python/ml-regression.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 6414dbf43a9..968858ec64b 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -22,7 +22,7 @@ jupyter: pygments_lexer: ipython3 version: 3.7.6 plotly: - description: Visualize regression in scikit-learn with Plotly + description: Visualize regression in scikit-learn with Plotly. 
display_as: ai_ml language: python layout: base @@ -30,7 +30,7 @@ jupyter: order: 2 page_type: example_index permalink: python/ml-regression/ - thumbnail: thumbnail/knn-classification.png + thumbnail: thumbnail/ml-regression.png --- ## Basic linear regression plots From a28ee1fb6d56550428b6a9fcb6c1247258efc0d2 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Mon, 2 Mar 2020 21:11:47 -0500 Subject: [PATCH 13/35] ML Docs: Added new section to regression, updated references --- doc/python/ml-regression.md | 76 ++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 968858ec64b..f345cc22445 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -323,6 +323,58 @@ fig = px.scatter( fig.show() ``` +## Regularization visualization + + +### Plot alphas for individual folds + +```python +import pandas as pd +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LassoCV + +# Load and preprocess the data +df = px.data.gapminder() +X = df.drop(columns=['lifeExp', 'iso_num']) +X = pd.get_dummies(X, columns=['country', 'continent', 'iso_alpha']) +y = df['lifeExp'] + +# Train model to predict life expectancy +model = LassoCV(cv=N_FOLD, normalize=True) +model.fit(X, y) +mean_alphas = model.mse_path_.mean(axis=-1) + +fig = go.Figure([ + go.Scatter( + x=model.alphas_, y=model.mse_path_[:, i], + name=f"Fold: {i+1}", opacity=.5, line=dict(dash='dash'), + hovertemplate="alpha: %{x}
MSE: %{y}" + ) + for i in range(N_FOLD) +]) +fig.add_traces(go.Scatter( + x=model.alphas_, y=mean_alphas, + name='Mean', line=dict(color='black', width=3), + hovertemplate="alpha: %{x}
MSE: %{y}", +)) + +fig.add_shape( + type="line", line=dict(dash='dash'), + x0=model.alpha_, y0=0, + x1=model.alpha_, y1=1, + yref='paper' +) + +fig.update_layout( + xaxis_title='alpha', + xaxis_type="log", + yaxis_title="Mean Square Error (MSE)" +) +fig.show() +``` + ## Grid search visualization using `px.density_heatmap` and `px.box` In this example, we show how to visualize the results of a grid search on a `DecisionTreeRegressor`. The first plot shows how to visualize the score of each model parameter on individual splits (grouped using facets). The second plot aggregates the results of all splits such that each box represents a single model. @@ -401,8 +453,22 @@ fig_box.show() ### Reference -Learn more about `px` here: -* https://plot.ly/python/plotly-express/ - -This tutorial was inspired by amazing examples from the official scikit-learn docs: -* https://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html +Learn more about the `px` figures used in this tutorial: +* Plotly Express: https://plot.ly/python/plotly-express/ +* Vertical Lines: https://plot.ly/python/shapes/ +* Heatmaps: https://plot.ly/python/heatmaps/ +* Box Plots: https://plot.ly/python/box-plots/ +* 3D Scatter: https://plot.ly/python/3d-scatter-plots/ +* Surface Plots: https://plot.ly/python/3d-surface-plots/ + +Learn more about the Machine Learning models used in this tutorial: +* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html +* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html +* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html +* https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html +* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html + +Other tutorials that inspired this notebook: +* https://seaborn.pydata.org/examples/residplot.html +* 
https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html +* http://www.scikit-yb.org/zh/latest/api/regressor/peplot.html From 0df5bcb3f1cbc61301adbc669f3092496b865002 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 6 Mar 2020 15:05:49 -0500 Subject: [PATCH 14/35] ML Docs: Added coefficient MLR example --- doc/python/ml-regression.md | 101 ++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 33 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index f345cc22445..e37e1d04e95 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -39,7 +39,7 @@ jupyter: ### Ordinary Least Square (OLS) with `plotly.express` -This example shows how to use `plotly.express` to train a simply Ordinary Least Square (OLS) that can predict the tips servers will receive based on the value of the total bill. +This example shows how to use `plotly.express`'s `trendline` parameter to train a simple Ordinary Least Squares (OLS) model for predicting the tips servers will receive based on the value of the total bill. ```python import plotly.express as px @@ -108,7 +108,7 @@ fig.show() ## Comparing different kNN models parameters -Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`. +Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`, such as coloring by the assigned `sex`. ```python import numpy as np @@ -136,9 +136,51 @@ fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) fig.show() ``` +## Displaying `PolynomialFeatures` using $\LaTeX$ + +It's easy to display LaTeX equations in legends and titles by simply adding `$` before and after your equation.
+ +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LinearRegression +from sklearn.preprocessing import PolynomialFeatures + +def format_coefs(coefs): + equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)] + equation = "$" + " + ".join(equation_list) + "$" + + replace_map = {"x^0": "", "x^1": "x", '+ -': '- '} + for old, new in replace_map.items(): + equation = equation.replace(old, new) + + return equation + +df = px.data.tips() +X = df.total_bill.values.reshape(-1, 1) +x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) + +fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) +for n_features in [1, 2, 3, 4]: + poly = PolynomialFeatures(n_features) + poly.fit(X) + X_poly = poly.transform(X) + x_range_poly = poly.transform(x_range) + + model = LinearRegression(fit_intercept=False) + model.fit(X_poly, df.tip) + y_poly = model.predict(x_range_poly) + + equation = format_coefs(model.coef_.round(2)) + fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) + +fig.show() +``` + ## 3D regression surface with `px.scatter_3d` and `go.Surface` -Visualize the decision plane of your model whenever you have more than one variable in your `X`. +Visualize the decision plane of your model whenever you have more than one variable in your input data. ```python import numpy as np @@ -176,53 +218,44 @@ fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface')) fig.show() ``` -## Displaying `PolynomialFeatures` using $\LaTeX$ +## Visualizing coefficients for multiple linear regression (MLR) -It's easy to diplay latex equations in legend and titles by simply adding `$` before and after your equation. +When you are fitting a linear regression, you often want to know which feature matters the most in your regression's output.
```python -import numpy as np import plotly.express as px import plotly.graph_objects as go from sklearn.linear_model import LinearRegression -from sklearn.preprocessing import PolynomialFeatures -def format_coefs(coefs): - equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)] - equation = "$" + " + ".join(equation_list) + "$" - - replace_map = {"x^0": "", "x^1": "x", '+ -': '- '} - for old, new in replace_map.items(): - equation = equation.replace(old, new) - - return equation +df = px.data.iris() -df = px.data.tips() -X = df.total_bill.values.reshape(-1, 1) -x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) +X = df.drop(columns=['petal_width', 'species_id']) +X = pd.get_dummies(X, columns=['species'], prefix_sep='=') +y = df['petal_width'] -fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) -for n_features in [1, 2, 3, 4]: - poly = PolynomialFeatures(n_features) - poly.fit(X) - X_poly = poly.transform(X) - x_range_poly = poly.transform(x_range) +model = LinearRegression() +model.fit(X, y) - model = LinearRegression(fit_intercept=False) - model.fit(X_poly, df.tip) - y_poly = model.predict(x_range_poly) - - equation = format_coefs(model.coef_.round(2)) - fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) +colors = ['Positive' if c > 0 else 'Negative' for c in model.coef_] +fig = px.bar( + x=X.columns, y=model.coef_, color=colors, + color_discrete_sequence=['red', 'blue'], + labels=dict(x='Feature', y='Linear coefficient'), + title='Weight of each feature for predicting petal width' +) fig.show() ``` ## Prediction Error Plots +When you are working with very high-dimensional data, it is inconvenient to plot every dimension with your output `y`. Instead, you can use methods such as prediction error plots, which let you visualize how well your model does compared to the ground truth. + ### Simple actual vs predicted plot +This example shows you the simplest way to compare the predicted output vs. the actual output. 
A good model will have most of the scatter dots near the diagonal black line. + ```python import plotly.express as px import plotly.graph_objects as go @@ -323,10 +356,10 @@ fig = px.scatter( fig.show() ``` -## Regularization visualization +## Visualize regularization across different cross-validation folds -### Plot alphas for individual folds +In this example, we show how to plot the results of various $\alpha$ penalization values from the results of cross-validation using scikit-learn's `LassoCV`. This is useful to see how much the error of the optimal alpha actually varies across CV folds. ```python import pandas as pd @@ -335,6 +368,8 @@ import plotly.express as px import plotly.graph_objects as go from sklearn.linear_model import LassoCV +N_FOLD = 6 + # Load and preprocess the data df = px.data.gapminder() X = df.drop(columns=['lifeExp', 'iso_num']) From 8e4dad233e0b4a03ac031db51f051e1b3a055132 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 6 Mar 2020 16:18:32 -0500 Subject: [PATCH 15/35] ML Docs: Start pca notebook --- doc/python/ml-pca.md | 135 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 doc/python/ml-pca.md diff --git a/doc/python/ml-pca.md b/doc/python/ml-pca.md new file mode 100644 index 00000000000..105edd8af66 --- /dev/null +++ b/doc/python/ml-pca.md @@ -0,0 +1,135 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize Principal Component Analysis (PCA) of your high-dimensional + data with Plotly on Python.
+ display_as: ai_ml + language: python + layout: base + name: PCA Visualization + order: 4 + page_type: example_index + permalink: python/pca-visualization/ + thumbnail: thumbnail/ml-pca.png +--- + +## Basic PCA Scatter Plot + +This example shows you how to simply visualize the first two principal components of a PCA, by reducing a dataset of 4 dimensions to 2D. It uses scikit-learn's `PCA`. + +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=2) +components = pca.fit_transform(X) + +fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species']) +fig.show() +``` + +## Visualize PCA with `px.scatter_3d` + +Just like the basic PCA plot, this lets you visualize the first 3 dimensions. This additionally displays the total variance explained by those components. + +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=3) +components = pca.fit_transform(X) + +total_var = pca.explained_variance_ratio_.sum() * 100 + +fig = px.scatter_3d( + x=components[:, 0], y=components[:, 1], z=components[:, 2], + color=df['species'], + title=f'Total Explained Variance: {total_var:.2f}%', + labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'}, +) +fig.show() +``` + +## Plot high-dimensional components with `px.scatter_matrix` + +If you need to visualize more than 3 dimensions, you can use scatter plot matrices.
+ +```python +import pandas as pd +from sklearn.decomposition import PCA +from sklearn.datasets import load_boston + +boston = load_boston() +df = pd.DataFrame(boston.data, columns=boston.feature_names) + +pca = PCA(n_components=5) +components = pca.fit_transform(df) + +total_var = pca.explained_variance_ratio_.sum() * 100 + +labels = {str(i): f"PC {i+1}" for i in range(5)} +labels['color'] = 'Median Price' + +fig = px.scatter_matrix( + components, + color=boston.target, + dimensions=range(5), + labels=labels, + title=f'Total Explained Variance: {total_var:.2f}%', +) +fig.update_traces(diagonal_visible=False) +fig.show() +``` + +## Plotting explained variance + +Often, you might be interested in seeing how much variance the PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset). 
+ +```python +import numpy as np +import pandas as pd +from sklearn.decomposition import PCA +from sklearn.datasets import load_diabetes + +boston = load_diabetes() +df = pd.DataFrame(boston.data, columns=boston.feature_names) + +pca = PCA() +pca.fit(df) +exp_var_cumul = np.cumsum(pca.explained_variance_ratio_) + +px.area( + x=range(1, exp_var_cumul.shape[0] + 1), + y=exp_var_cumul, + labels={"x": "# Components", "y": "Explained Variance"} +) +``` + +## Visualize loadings From 4b7143061f0fe58289f5d796b8fc960f82844638 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Fri, 6 Mar 2020 18:06:03 -0500 Subject: [PATCH 16/35] ML Docs: Start ROC/PR section --- doc/python/ml-roc-pr.md | 201 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 doc/python/ml-roc-pr.md diff --git a/doc/python/ml-roc-pr.md b/doc/python/ml-roc-pr.md new file mode 100644 index 00000000000..8c1bc6becb4 --- /dev/null +++ b/doc/python/ml-roc-pr.md @@ -0,0 +1,201 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Interpret the results of your classification using Receiver Operating + Characteristics (ROC) and Precision-Recall (PR) Curves using Plotly on Python. 
+ display_as: ai_ml + language: python + layout: base + name: ROC and PR Curves + order: 3 + page_type: example_index + permalink: python/roc-and-pr-curves/ + thumbnail: thumbnail/ml-roc-pr.png +--- + +## Basic Binary ROC Curve + +```python +import plotly.express as px +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_curve, auc +from sklearn.datasets import make_classification + +X, y = make_classification(n_samples=500, random_state=0) + +model = LogisticRegression() +model.fit(X, y) +y_score = model.predict_proba(X)[:, 1] + +fpr, tpr, thresholds = roc_curve(y, y_score) + +fig = px.area( + x=fpr, y=tpr, + title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})', + labels=dict(x='False Positive Rate', y='True Positive Rate') +) +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=0, y1=1 +) +fig.show() +``` + +## Multiclass ROC Curve + +When you have more than 2 classes, you will need to plot the ROC curve for each class separately. Make sure that you use a [one-versus-rest](https://scikit-learn.org/stable/modules/multiclass.html#one-vs-the-rest) model, or make sure that your problem has a [multi-label](https://scikit-learn.org/stable/modules/multiclass.html#multilabel-classification-format) format; otherwise, your ROC curve might not return the expected results. 
+ +```python +import numpy as np +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_curve, roc_auc_score +import plotly.graph_objects as go +import plotly.express as px + +np.random.seed(0) + +# Artificially add noise to make task harder +df = px.data.iris() +samples = df.species.sample(n=50, random_state=0) +np.random.shuffle(samples.values) +df.loc[samples.index, 'species'] = samples.values + +# Define the inputs and outputs +X = df.drop(columns=['species', 'species_id']) +y = df['species'] +y_onehot = pd.get_dummies(y, columns=model.classes_) + +# Fit the model +model = LogisticRegression(max_iter=200) +model.fit(X, y) +y_scores = model.predict_proba(X) + +# Create an empty figure, and iteratively add new lines +# every time we compute a new class +fig = go.Figure() +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=0, y1=1 +) + +for i in range(y_scores.shape[1]): + y_true = y_onehot.iloc[:, i] + y_score = y_scores[:, i] + + fpr, tpr, _ = roc_curve(y_true, y_score) + auc_score = roc_auc_score(y_true, y_score) + + name = f"{y_onehot.columns[i]} (AUC={auc_score:.2f})" + fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines')) + +fig.update_layout( + xaxis_title='False Positive Rate', + yaxis_title='True Positive Rate' +) +fig.show() +``` + +## Precision-Recall Curves + +Plotting the PR curve is very similar to plotting the ROC curve. 
The following examples are slightly modified from the previous examples: + +```python +import plotly.express as px +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import precision_recall_curve, auc +from sklearn.datasets import make_classification + +X, y = make_classification(n_samples=500, random_state=0) + +model = LogisticRegression() +model.fit(X, y) +y_score = model.predict_proba(X)[:, 1] + +precision, recall, thresholds = precision_recall_curve(y, y_score) + +fig = px.area( + x=recall, y=precision, + title=f'Precision-Recall Curve (AUC={auc(fpr, tpr):.4f})', + labels=dict(x='Recall', y='Precision') +) +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=1, y1=0 +) +fig.show() +``` + +In this example, we use the [average precision](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html) metric, which is an alternative scoring method to the area under the PR curve. + +```python +import numpy as np +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import precision_recall_curve, average_precision_score +import plotly.graph_objects as go +import plotly.express as px + +np.random.seed(0) + +# Artificially add noise to make task harder +df = px.data.iris() +samples = df.species.sample(n=30, random_state=0) +np.random.shuffle(samples.values) +df.loc[samples.index, 'species'] = samples.values + +# Define the inputs and outputs +X = df.drop(columns=['species', 'species_id']) +y = df['species'] +y_onehot = pd.get_dummies(y, columns=model.classes_) + +# Fit the model +model = LogisticRegression(max_iter=200) +model.fit(X, y) +y_scores = model.predict_proba(X) + +# Create an empty figure, and iteratively add new lines +# every time we compute a new class +fig = go.Figure() +fig.add_shape( + type='line', line=dict(dash='dash'), + x0=0, x1=1, y0=1, y1=0 +) + +for i in range(y_scores.shape[1]): + y_true = y_onehot.iloc[:, i] + y_score = 
y_scores[:, i] + + precision, recall, _ = precision_recall_curve(y_true, y_score) + auc_score = average_precision_score(y_true, y_score) + + name = f"{y_onehot.columns[i]} (AP={auc_score:.2f})" + fig.add_trace(go.Scatter(x=recall, y=precision, name=name, mode='lines')) + +fig.update_layout( + xaxis_title='Recall', + yaxis_title='Precision' +) +fig.show() +``` From ca2494980ccd67e338c5791a6a4e70eb3a4bd42c Mon Sep 17 00:00:00 2001 From: xhlu Date: Fri, 13 Mar 2020 15:21:35 -0400 Subject: [PATCH 17/35] ML Docs: Remove 2 sections Removed: * 3D scatter * Splom --- doc/python/ml-knn.md | 84 +++++++++----------------------------------- 1 file changed, 16 insertions(+), 68 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 2bcab469875..adac06295f2 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -34,7 +34,20 @@ jupyter: thumbnail: thumbnail/knn-classification.png --- -## Basic Binary Classification with `plotly.express` +## Basic binary classification with kNN + + +### Display training and test splits + +```python + +``` + +### Visualize predictions on test split + +```python + +``` ```python import numpy as np @@ -113,7 +126,7 @@ fig.add_trace( showscale=False, colorscale=['Blue', 'Red'], opacity=0.4, - name='Confidence' + name='Score' ) ) fig.show() @@ -150,7 +163,7 @@ Z = Z.reshape(ll.shape) proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()]) proba = proba.reshape(ll.shape + (3,)) -fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species', width=1000, height=1000) +fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') fig.update_traces(marker_size=10, marker_line_width=1) fig.add_trace( go.Heatmap( @@ -173,77 +186,12 @@ fig.add_trace( fig.show() ``` -## 3D Classification with `px.scatter_3d` - -```python -import numpy as np -import plotly.express as px -import plotly.graph_objects as go -from sklearn.neighbors import KNeighborsClassifier -from sklearn.model_selection import train_test_split - 
-df = px.data.iris() -features = ["sepal_width", "sepal_length", "petal_width"] - -X = df[features] -y = df.species -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) - -# Create classifier, run predictions on grid -clf = KNeighborsClassifier(15, weights='distance') -clf.fit(X_train, y_train) -y_pred = clf.predict(X_test) -y_score = clf.predict_proba(X_test) -y_score = np.around(y_score.max(axis=1), 4) - -fig = px.scatter_3d( - X_test, - x='sepal_length', - y='sepal_width', - z='petal_width', - symbol=y_pred, - color=y_score, - labels={'symbol': 'prediction', 'color': 'score'} -) -fig.update_layout(legend=dict(x=0, y=0)) -fig.show() -``` - -## High Dimension Visualization with `px.scatter_matrix` - -If you need to visualize classifications that go beyond 3D, you can use the [scatter plot matrix](https://plot.ly/python/splom/). - -```python -import numpy as np -import plotly.express as px -import plotly.graph_objects as go -from sklearn.neighbors import KNeighborsClassifier -from sklearn.model_selection import train_test_split - -df = px.data.iris() -features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] - -X = df[features] -y = df.species -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) - -# Create classifier, run predictions on grid -clf = KNeighborsClassifier(15, weights='distance') -clf.fit(X_train, y_train) -y_pred = clf.predict(X_test) - -fig = px.scatter_matrix(X_test, dimensions=features, color=y_pred, labels={'color': 'prediction'}) -fig.show() -``` - ### Reference Learn more about `px`, `go.Contour`, and `go.Heatmap` here: * https://plot.ly/python/plotly-express/ * https://plot.ly/python/heatmaps/ * https://plot.ly/python/contour-plots/ -* https://plot.ly/python/3d-scatter-plots/ -* https://plot.ly/python/splom/ This tutorial was inspired by amazing examples from the official scikit-learn docs: * 
https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html From 99621b08a4c5d2312caed7d39a36ab026f848a8f Mon Sep 17 00:00:00 2001 From: xhlu Date: Fri, 13 Mar 2020 15:25:58 -0400 Subject: [PATCH 18/35] ML Docs, Regression: fix import, update titles, colors --- doc/python/ml-regression.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index e37e1d04e95..6f7f56c51b6 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -47,7 +47,7 @@ import plotly.express as px df = px.data.tips() fig = px.scatter( df, x='total_bill', y='tip', opacity=0.65, - trendline='ols', trendline_color_override='red' + trendline='ols', trendline_color_override='darkblue' ) fig.show() ``` @@ -223,6 +223,7 @@ fig.show() When you are fitting a linear regression, you want to often know what feature matters the most in your regression's output. ```python +import pandas as pd import plotly.express as px import plotly.graph_objects as go from sklearn.linear_model import LinearRegression @@ -356,14 +357,14 @@ fig = px.scatter( fig.show() ``` -## Visualize regularization across different cross-validation folds +## Visualize regularization across cross-validation folds In this example, we show how to plot the results of various $\alpha$ penalization values from the results of cross-validation using scikit-learn's `LassoCV`. This is useful to see how much the error of the optimal alpha actually varies across CV folds. 
```python -import pandas as pd import numpy as np +import pandas as pd import plotly.express as px import plotly.graph_objects as go from sklearn.linear_model import LassoCV From 0cde621d14a5aeeedffed5cb3b07aa9a7f321428 Mon Sep 17 00:00:00 2001 From: xhlu Date: Fri, 13 Mar 2020 19:35:06 -0400 Subject: [PATCH 19/35] ML Docs: Update all kNN sections based on discussions --- doc/python/ml-knn.md | 141 +++++++++++++++++++++++++++++++------------ 1 file changed, 103 insertions(+), 38 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index adac06295f2..d53b18e9f95 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -36,56 +36,79 @@ jupyter: ## Basic binary classification with kNN +This section gets us started with displaying basic binary classification using 2D data. We first show how to display training versus testing data using [various marker styles](https://plot.ly/python/marker-style/), then demonstrate how to evaluate a kNN classifier's performance on the **test split** using a continuous color gradient to indicate the model's predicted score. -### Display training and test splits - -```python - -``` -### Visualize predictions on test split +### Display training and test splits -```python -``` +Here, we display all the negative labels as squares, and positive labels as circles. We differentiate the training and test set by adding a dot to the center of test data. 
```python import numpy as np import plotly.express as px import plotly.graph_objects as go from sklearn.datasets import make_moons +from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier X, y = make_moons(noise=0.3, random_state=0) -X_test, _ = make_moons(noise=0.3, random_state=1) - -clf = KNeighborsClassifier(15) -clf.fit(X, y.astype(str)) # Fit on training set -y_pred = clf.predict(X_test) # Predict on new data - -fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred, labels={'color': 'predicted'}) -fig.update_traces(marker_size=10) +X_train, X_test, y_train, y_test = train_test_split( + X, y.astype(str), test_size=0.25, random_state=0) + +trace_specs = [ + [X_train, y_train, '0', 'Train', 'square'], + [X_train, y_train, '1', 'Train', 'circle'], + [X_test, y_test, '0', 'Test', 'square-dot'], + [X_test, y_test, '1', 'Test', 'circle-dot'] +] + +fig = go.Figure(data=[ + go.Scatter( + x=X[y==label, 0], y=X[y==label, 1], + name=f'{split} Split, Label {label}', + mode='markers', marker_symbol=marker + ) + for X, y, label, split, marker in trace_specs +]) +fig.update_traces( + marker_size=12, marker_line_width=1.5, + marker_color="lightyellow" +) fig.show() ``` -## Visualize Binary Prediction Scores +### Visualize predictions on test split + + +Now, we evaluate the model only on the test set. Notice that `px.scatter` only require 1 function call to plot both negative and positive labels, and can additionally set a continuous color scale based on the `y_score` output by our kNN model. 
```python import numpy as np import plotly.express as px import plotly.graph_objects as go -from sklearn.datasets import make_classification +from sklearn.datasets import make_moons +from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier -X, y = make_classification(n_features=2, n_redundant=0, random_state=0) -X_test, _ = make_classification(n_features=2, n_redundant=0, random_state=1) +# Load and split data +X, y = make_moons(noise=0.3, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, y.astype(str), test_size=0.25, random_state=0) +# Fit the model on training data, predict on test data clf = KNeighborsClassifier(15) -clf.fit(X, y) # Fit on training set -y_score = clf.predict_proba(X_test)[:, 1] # Predict on new data - -fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_score, labels={'color': 'score'}) -fig.update_traces(marker_size=10) +clf.fit(X_train, y_train) +y_score = clf.predict_proba(X_test)[:, 1] + +fig = px.scatter( + X_test, x=0, y=1, + color=y_score, color_continuous_scale='RdBu', + symbol=y_test, symbol_map={'0': 'square-dot', '1': 'circle-dot'}, + labels={'symbol': 'Label', 'color': 'Score'} +) +fig.update_traces(marker_size=12, marker_line_width=1.5) +fig.update_layout(legend_orientation='h') fig.show() ``` @@ -96,12 +119,16 @@ import numpy as np import plotly.express as px import plotly.graph_objects as go from sklearn.datasets import make_moons +from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier mesh_size = .02 -margin = 1 +margin = 0.25 +# Load and split data X, y = make_moons(noise=0.3, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, y.astype(str), test_size=0.25, random_state=0) # Create a mesh grid on which we will run our model x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin @@ -116,17 +143,36 @@ clf.fit(X, y) Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] Z 
= Z.reshape(xx.shape) -fig = px.scatter(X, x=0, y=1, color=y.astype(str), labels={'0':'', '1':''}) -fig.update_traces(marker_size=10, marker_line_width=1) +trace_specs = [ + [X_train, y_train, '0', 'Train', 'square'], + [X_train, y_train, '1', 'Train', 'circle'], + [X_test, y_test, '0', 'Test', 'square-dot'], + [X_test, y_test, '1', 'Test', 'circle-dot'] +] + +fig = go.Figure(data=[ + go.Scatter( + x=X[y==label, 0], y=X[y==label, 1], + name=f'{split} Split, Label {label}', + mode='markers', marker_symbol=marker + ) + for X, y, label, split, marker in trace_specs +]) +fig.update_traces( + marker_size=12, marker_line_width=1.5, + marker_color="lightyellow" +) + fig.add_trace( go.Contour( x=xrange, y=yrange, z=Z, showscale=False, - colorscale=['Blue', 'Red'], + colorscale='RdBu', opacity=0.4, - name='Score' + name='Score', + hoverinfo='skip' ) ) fig.show() @@ -134,6 +180,8 @@ fig.show() ## Multi-class prediction confidence with `go.Heatmap` +It is also possible to visualize the prediction confidence of the model using `go.Heatmap`. In this example, you can see how to compute how confident the model is about its prediction at every point in the 2D grid. Here, we define the confidence as the difference between the highest score and the score of the other classes summed, at a certain point. 
+ ```python import numpy as np import plotly.express as px @@ -145,8 +193,9 @@ margin = 1 # We will use the iris data, which is included in px df = px.data.iris() -X = df[['sepal_length', 'sepal_width']] -y = df.species_id +df_train, df_test = train_test_split(df, test_size=0.25, random_state=0) +X_train = df_train[['sepal_length', 'sepal_width']] +y_train = df_train.species_id # Create a mesh grid on which we will run our model l_min, l_max = df.sepal_length.min() - margin, df.sepal_length.max() + margin @@ -157,23 +206,35 @@ ll, ww = np.meshgrid(lrange, wrange) # Create classifier, run predictions on grid clf = KNeighborsClassifier(15, weights='distance') -clf.fit(X, y) +clf.fit(X_train, y_train) Z = clf.predict(np.c_[ll.ravel(), ww.ravel()]) Z = Z.reshape(ll.shape) proba = clf.predict_proba(np.c_[ll.ravel(), ww.ravel()]) proba = proba.reshape(ll.shape + (3,)) -fig = px.scatter(df, x='sepal_length', y='sepal_width', color='species') -fig.update_traces(marker_size=10, marker_line_width=1) +# Compute the confidence, which is the difference +diff = proba.max(axis=-1) - (proba.sum(axis=-1) - proba.max(axis=-1)) + +fig = px.scatter( + df_test, x='sepal_length', y='sepal_width', + symbol='species', + symbol_map={ + 'setosa': 'square-dot', + 'versicolor': 'circle-dot', + 'virginica': 'diamond-dot'}, +) +fig.update_traces( + marker_size=12, marker_line_width=1.5, + marker_color="lightyellow" +) fig.add_trace( go.Heatmap( x=lrange, y=wrange, - z=Z, - showscale=False, - colorscale=[[0.0, 'blue'], [0.5, 'red'], [1.0, 'green']], + z=diff, opacity=0.25, customdata=proba, + colorscale='RdBu', hovertemplate=( 'sepal length: %{x}
' 'sepal width: %{y}
' @@ -183,6 +244,10 @@ fig.add_trace( ) ) ) +fig.update_layout( + legend_orientation='h', + title='Prediction Confidence on Test Split' +) fig.show() ``` From 7447304d8ba88e44cdc2d05a77c97212410b59c3 Mon Sep 17 00:00:00 2001 From: xhlu Date: Fri, 13 Mar 2020 22:52:45 -0400 Subject: [PATCH 20/35] ML Docs: Update Regression notebook Added a preliminary section that introduces roc curves --- doc/python/ml-roc-pr.md | 77 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/doc/python/ml-roc-pr.md b/doc/python/ml-roc-pr.md index 8c1bc6becb4..44f9e53ff48 100644 --- a/doc/python/ml-roc-pr.md +++ b/doc/python/ml-roc-pr.md @@ -34,7 +34,58 @@ jupyter: thumbnail: thumbnail/ml-roc-pr.png --- -## Basic Binary ROC Curve +## Preliminary plots + +Before diving into the receiver operating characteristic (ROC) curve, we will look at two plots that will give some context to the threshold mechanism behind the ROC and PR curves. + +In the histogram, we observe that the scores are spread such that most of the positive labels are binned near 1, and a lot of the negative labels are close to 0. When we set a threshold on the score, all of the bins to its left will be classified as 0's, and everything to the right will be 1's. There are obviously a few outliers, such as **negative** samples that our model gave a high score, and *positive* samples with a low score. If we set a threshold right in the middle, those outliers will respectively become **false positives** and *false negatives*. + +As we adjust thresholds, the number of false positives will increase or decrease, and at the same time the number of true positives will also change; this is shown in the second plot. As you can see, the model seems to perform fairly well, because the true positive rate decreases slowly, whereas the false positive rate decreases sharply as we increase the threshold. Those two lines each represent a dimension of the ROC curve. 
+ ```python +import plotly.express as px +import plotly.graph_objects as go +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import roc_curve, auc +from sklearn.datasets import make_classification + +X, y = make_classification(n_samples=500, random_state=0) + +model = LogisticRegression() +model.fit(X, y) +y_score = model.predict_proba(X)[:, 1] +fpr, tpr, thresholds = roc_curve(y, y_score) + +# The histogram of scores compared to true labels +fig_hist = px.histogram( + x=y_score, color=y, nbins=50, + labels=dict(color='True Labels', x='Score') +) + +# Evaluating model performance at various thresholds +fig_thresh = go.Figure([ + go.Scatter(x=thresholds, y=fpr, name='False Positive Rate'), + go.Scatter(x=thresholds, y=tpr, name='True Positive Rate') +]) +fig_thresh.update_layout( + title='TPR and FPR at every threshold', + xaxis_title='Threshold', + yaxis_title='Rate', + yaxis=dict(scaleanchor="x", scaleratio=1), + xaxis=dict(constrain='domain') +) +fig_thresh.update_xaxes(range=[0, 1]) + +# Display plots +fig_hist.show() +fig_thresh.show() +``` + +## Basic binary ROC curve + +Notice how this ROC curve looks similar to the True Positive Rate curve from the previous plot. This is because they are the same curve, except the x-axis consists of increasing values of FPR instead of threshold, which is why the line is flipped and distorted. + +We also display the area under the ROC curve (ROC AUC), which is fairly high, thus consistent with our interpretation of the previous plots. 
```python import plotly.express as px @@ -59,6 +110,10 @@ fig.add_shape( type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1 ) +fig.update_layout( + yaxis=dict(scaleanchor="x", scaleratio=1), + xaxis=dict(constrain='domain') +) fig.show() ``` @@ -112,7 +167,9 @@ for i in range(y_scores.shape[1]): fig.update_layout( xaxis_title='False Positive Rate', - yaxis_title='True Positive Rate' + yaxis_title='True Positive Rate', + yaxis=dict(scaleanchor="x", scaleratio=1), + xaxis=dict(constrain='domain') ) fig.show() ``` @@ -144,6 +201,11 @@ fig.add_shape( type='line', line=dict(dash='dash'), x0=0, x1=1, y0=1, y1=0 ) +fig.update_layout( + yaxis=dict(scaleanchor="x", scaleratio=1), + xaxis=dict(constrain='domain') +) + fig.show() ``` @@ -195,7 +257,16 @@ for i in range(y_scores.shape[1]): fig.update_layout( xaxis_title='Recall', - yaxis_title='Precision' + yaxis_title='Precision', + yaxis=dict(scaleanchor="x", scaleratio=1), + xaxis=dict(constrain='domain') ) fig.show() ``` + +## References + +Learn more about `px`, `px.area`, `px.hist`: +* https://plot.ly/python/histograms/ +* https://plot.ly/python/filled-area-plots/ +* https://plot.ly/python/line-charts/ From 7ab73cb4ff1a87b4e880bae800d5878e1fdc2458 Mon Sep 17 00:00:00 2001 From: xhlu Date: Fri, 13 Mar 2020 22:52:59 -0400 Subject: [PATCH 21/35] ML Docs: Updated PCA notebook Added loadings, moved high-dimensional analysis first --- doc/python/ml-pca.md | 165 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 138 insertions(+), 27 deletions(-) diff --git a/doc/python/ml-pca.md b/doc/python/ml-pca.md index 105edd8af66..6fa3db760ef 100644 --- a/doc/python/ml-pca.md +++ b/doc/python/ml-pca.md @@ -34,73 +34,88 @@ jupyter: thumbnail: thumbnail/ml-pca.png --- -## Basic PCA Scatter Plot +## High-dimensional PCA Analysis with `px.scatter_matrix` -This example shows you how to simply visualize the first two principal components of a PCA, by reducing a dataset of 4 dimensions to 2D. It uses scikit-learn's `PCA`. 
+ +### Visualize all the original dimensions + +First, let's plot all the features and see how the `species` in the Iris dataset are grouped. In a [splom](https://plot.ly/python/splom/), each subplot displays a feature against another, so if we have $N$ features we have an $N \times N$ matrix. + +In our example, we are plotting all 4 features from the Iris dataset, thus we can see how `sepal_width` is compared against `sepal_length`, then against `petal_width`, and so forth. Keep in mind how some pairs of features can more easily separate different species. ```python import plotly.express as px -from sklearn.decomposition import PCA df = px.data.iris() -X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] +features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] -pca = PCA(n_components=2) -components = pca.fit_transform(X) - -fig = px.scatter(x=components[:, 0], y=components[:, 1], color=df['species']) +fig = px.scatter_matrix( + df, + dimensions=features, + color="species" +) +fig.update_traces(diagonal_visible=False) fig.show() ``` -## Visualize PCA with `px.scatter_3d` +### Visualize all the principal components + +Now, we apply `PCA` to the same dataset, and retrieve **all** the components. We use the same `px.scatter_matrix` trace to display our results, but this time our features are the resulting *principal components*, ordered by how much variance they are able to explain. -Just like the basic PCA plot, this let you visualize the first 3 dimensions. This additionally displays the total variance explained by those components. +The importance of explained variance is demonstrated in the example below. The subplot between PC3 and PC4 is clearly unable to separate each class, whereas the subplot between PC1 and PC2 shows a clear separation between each species. 
```python import plotly.express as px from sklearn.decomposition import PCA df = px.data.iris() -X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] - -pca = PCA(n_components=3) -components = pca.fit_transform(X) +features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] -total_var = pca.explained_variance_ratio_.sum() * 100 +pca = PCA() +components = pca.fit_transform(df[features]) +labels = { + str(i): f"PC {i+1} ({var:.1f}%)" + for i, var in enumerate(pca.explained_variance_ratio_ * 100) +} -fig = px.scatter_3d( - x=components[:, 0], y=components[:, 1], z=components[:, 2], - color=df['species'], - title=f'Total Explained Variance: {total_var:.2f}%', - labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'}, +fig = px.scatter_matrix( + components, + labels=labels, + dimensions=range(4), + color=df["species"] ) +fig.update_traces(diagonal_visible=False) fig.show() ``` -## Plot high-dimensional components with `px.scatter_matrix` +### Visualize a subset of the principal components + +When you will have too many features to visualize, you might be interested in only visualizing the most relevant components. Those components often capture a majority of the [explained variance](https://en.wikipedia.org/wiki/Explained_variation), which is a good way to tell if those components are sufficient for modelling this dataset. -If you need to visualize more than 3 dimensions, you can use scatter plot matrices. +In the example below, our dataset contains 10 features, but we only select the first 4 components, since they explain over 99% of the total variance. 
```python import pandas as pd +import plotly.express as px from sklearn.decomposition import PCA from sklearn.datasets import load_boston boston = load_boston() df = pd.DataFrame(boston.data, columns=boston.feature_names) +n_components = 4 -pca = PCA(n_components=5) +pca = PCA(n_components=n_components) components = pca.fit_transform(df) total_var = pca.explained_variance_ratio_.sum() * 100 -labels = {str(i): f"PC {i+1}" for i in range(5)} +labels = {str(i): f"PC {i+1}" for i in range(n_components)} labels['color'] = 'Median Price' fig = px.scatter_matrix( components, color=boston.target, - dimensions=range(5), + dimensions=range(n_components), labels=labels, title=f'Total Explained Variance: {total_var:.2f}%', ) @@ -108,13 +123,56 @@ fig.update_traces(diagonal_visible=False) fig.show() ``` +## 2D PCA Scatter Plot + +In the previous examples, you saw how to visualize high-dimensional PCs. In this example, we show you how to simply visualize the first two principal components of a PCA, by reducing a dataset of 4 dimensions to 2D. + +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=2) +components = pca.fit_transform(X) + +fig = px.scatter(components, x=0, y=1, color=df['species']) +fig.show() +``` + +## Visualize PCA with `px.scatter_3d` + +With `px.scatter_3d`, you can visualize an additional dimension, which let you capture even more variance. 
+ +```python +import plotly.express as px +from sklearn.decomposition import PCA + +df = px.data.iris() +X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + +pca = PCA(n_components=3) +components = pca.fit_transform(X) + +total_var = pca.explained_variance_ratio_.sum() * 100 + +fig = px.scatter_3d( + components, x=0, y=1, z=2, color=df['species'], + title=f'Total Explained Variance: {total_var:.2f}%', + labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'} +) +fig.show() +``` + ## Plotting explained variance -Often, you might be interested in seeing how much variance the PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset). +Often, you might be interested in seeing how much variance PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset). ```python import numpy as np import pandas as pd +import plotly.express as px from sklearn.decomposition import PCA from sklearn.datasets import load_diabetes @@ -132,4 +190,57 @@ px.area( ) ``` -## Visualize loadings +## Visualize Loadings + +It is also possible to visualize loadings using `shapes`, and use `annotations` to indicate which feature a certain loading original belong to. 
Here, we define loadings as: + +$$ +loadings = eigenvectors \cdot \sqrt{eigenvalues} +$$ + +```python +import plotly.express as px +from sklearn.decomposition import PCA +from sklearn import datasets +from sklearn.preprocessing import StandardScaler + +df = px.data.iris() +features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] +X = df[features] + +pca = PCA(n_components=2) +components = pca.fit_transform(X) + +loadings = pca.components_.T * np.sqrt(pca.explained_variance_) + +fig = px.scatter(components, x=0, y=1, color=df['species']) + +for i, feature in enumerate(features): + fig.add_shape( + type='line', + x0=0, y0=0, + x1=loadings[i, 0], + y1=loadings[i, 1] + ) + fig.add_annotation( + x=loadings[i, 0], + y=loadings[i, 1], + ax=0, ay=0, + xanchor="center", + yanchor="bottom", + text=feature, + ) +fig.show() +``` + +## References + +Learn more about `px`, `px.scatter_3d`, and `px.scatter_matrix` here: +* https://plot.ly/python/plotly-express/ +* https://plot.ly/python/3d-scatter-plots/ +* https://plot.ly/python/splom/ + +The following resources offer an in-depth overview of PCA and explained variance: +* https://en.wikipedia.org/wiki/Explained_variation +* https://scikit-learn.org/stable/modules/decomposition.html#pca +* https://stats.stackexchange.com/questions/2691/making-sense-of-principal-component-analysis-eigenvectors-eigenvalues/140579#140579 From 0e8b5d64daecff24ee7c82ee6090ac508b495ce7 Mon Sep 17 00:00:00 2001 From: xhlu Date: Tue, 17 Mar 2020 12:59:57 -0400 Subject: [PATCH 22/35] ML Docs: Update knn and regression based on Emma's reviews --- doc/python/ml-knn.md | 2 +- doc/python/ml-regression.md | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index d53b18e9f95..74b830a3b3f 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -105,7 +105,7 @@ fig = px.scatter( X_test, x=0, y=1, color=y_score, color_continuous_scale='RdBu', 
symbol=y_test, symbol_map={'0': 'square-dot', '1': 'circle-dot'}, - labels={'symbol': 'Label', 'color': 'Score'} + labels={'symbol': 'label', 'color': 'score of
first class'} ) fig.update_traces(marker_size=12, marker_line_width=1.5) fig.update_layout(legend_orientation='h') diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 6f7f56c51b6..d1945f742b3 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -39,7 +39,7 @@ jupyter: ### Ordinary Least Square (OLS) with `plotly.express` -This example shows how to use `plotly.express`'s `trendline` parameter to train a simply Ordinary Least Square (OLS) for predicting the tips servers will receive based on the value of the total bill. +This example shows how to use `plotly.express`'s `trendline` parameter to train a simply Ordinary Least Square (OLS) for predicting the tips waiters will receive based on the value of the total bill. ```python import plotly.express as px @@ -88,7 +88,7 @@ from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split df = px.data.tips() -X = df.total_bill.values.reshape(-1, 1) +X = df.total_bill[:, None] X_train, X_test, y_train, y_test = train_test_split(X, df.tip, random_state=0) model = LinearRegression() @@ -162,8 +162,8 @@ X = df.total_bill.values.reshape(-1, 1) x_range = np.linspace(X.min(), X.max(), 100).reshape(-1, 1) fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65) -for n_features in [1, 2, 3, 4]: - poly = PolynomialFeatures(n_features) +for degree in [1, 2, 3, 4]: + poly = PolynomialFeatures(degree) poly.fit(X) X_poly = poly.transform(X) x_range_poly = poly.transform(x_range) @@ -180,13 +180,13 @@ fig.show() ## 3D regression surface with `px.scatter_3d` and `go.Surface` -Visualize the decision plane of your model whenever you have more than one variable in your input data. +Visualize the decision plane of your model whenever you have more than one variable in your input data. 
Here, we will use [`sklearn.svm.SVR`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html), which is a Support Vector Machine (SVM) model specifically designed for regression. ```python import numpy as np import plotly.express as px import plotly.graph_objects as go -from sklearn.neighbors import KNeighborsRegressor +from sklearn.svm import SVR mesh_size = .02 margin = 0 @@ -197,8 +197,8 @@ X = df[['sepal_width', 'sepal_length']] y = df['petal_width'] # Condition the model on sepal width and length, predict the petal width -knn = KNeighborsRegressor(10, weights='distance') -knn.fit(X, y) +model = SVR(C=1.) +model.fit(X, y) # Create a mesh grid on which we will run our model x_min, x_max = X.sepal_width.min() - margin, X.sepal_width.max() + margin @@ -207,8 +207,8 @@ xrange = np.arange(x_min, x_max, mesh_size) yrange = np.arange(y_min, y_max, mesh_size) xx, yy = np.meshgrid(xrange, yrange) -# Run kNN -pred = knn.predict(np.c_[xx.ravel(), yy.ravel()]) +# Run model +pred = model.predict(np.c_[xx.ravel(), yy.ravel()]) pred = pred.reshape(xx.shape) # Generate the plot @@ -271,7 +271,7 @@ model = LinearRegression() model.fit(X, y) y_pred = model.predict(X) -fig = px.scatter(x=y_pred, y=y, labels={'x': 'prediction', 'y': 'actual'}) +fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'}) fig.add_shape( type="line", line=dict(dash='dash'), x0=y.min(), y0=y.min(), @@ -308,10 +308,11 @@ model.fit(X_train, y_train) df['prediction'] = model.predict(X) fig = px.scatter( - df, x='prediction', y='petal_width', + df, x='petal_width', y='prediction', marginal_x='histogram', marginal_y='histogram', color='split', trendline='ols' ) +fig.update_traces(histnorm='probability', selector={'type':'histogram'}) fig.add_shape( type="line", line=dict(dash='dash'), x0=y.min(), y0=y.min(), From 3bb49a3d960c05b964ce9298f23c3b1959b0d163 Mon Sep 17 00:00:00 2001 From: xhlu Date: Tue, 17 Mar 2020 15:11:36 -0400 Subject: [PATCH 23/35] ML Docs: Update 
header description --- doc/python/ml-knn.md | 2 +- doc/python/ml-pca.md | 2 +- doc/python/ml-roc-pr.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 74b830a3b3f..fd5e87668e9 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -23,7 +23,7 @@ jupyter: version: 3.7.6 plotly: description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification - with Plotly + in Python with Plotly. display_as: ai_ml language: python layout: base diff --git a/doc/python/ml-pca.md b/doc/python/ml-pca.md index 6fa3db760ef..41cab7a64ff 100644 --- a/doc/python/ml-pca.md +++ b/doc/python/ml-pca.md @@ -23,7 +23,7 @@ jupyter: version: 3.7.6 plotly: description: Visualize Principle Component Analysis (PCA) of your high-dimensional - data with Plotly on Python. + data in Python with Plotly. display_as: ai_ml language: python layout: base diff --git a/doc/python/ml-roc-pr.md b/doc/python/ml-roc-pr.md index 44f9e53ff48..ad30cd951b3 100644 --- a/doc/python/ml-roc-pr.md +++ b/doc/python/ml-roc-pr.md @@ -23,7 +23,7 @@ jupyter: version: 3.7.6 plotly: description: Interpret the results of your classification using Receiver Operating - Characteristics (ROC) and Precision-Recall (PR) Curves using Plotly on Python. + Characteristics (ROC) and Precision-Recall (PR) Curves in Python with Plotly. 
display_as: ai_ml language: python layout: base From 895231f9bf56d95495339c634ebaae7a66099af1 Mon Sep 17 00:00:00 2001 From: xhlu Date: Tue, 17 Mar 2020 16:30:58 -0400 Subject: [PATCH 24/35] ML Docs: Add t-SNE/UMAP notebook (read todo) TODO: Add thumbnail, references, description of sections --- doc/python/tsne-umap-projections.md | 149 ++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 doc/python/tsne-umap-projections.md diff --git a/doc/python/tsne-umap-projections.md b/doc/python/tsne-umap-projections.md new file mode 100644 index 00000000000..a2a2aa78604 --- /dev/null +++ b/doc/python/tsne-umap-projections.md @@ -0,0 +1,149 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.6 + plotly: + description: Visualize t-SNE and UMAP projections of high-dimensional data + in Python with Plotly. 
+ display_as: ai_ml + language: python + layout: base + name: t-SNE and UMAP projections + order: 1 + page_type: example_index + permalink: python/t-sne-and-umap-projections/ + thumbnail: thumbnail/tsne-umap-projections.png +--- + +## Basic t-SNE projections + + +### Visualizing high-dimensional data with `px.scatter_matrix` + +```python +import plotly.express as px + +df = px.data.iris() +features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] +fig = px.scatter_matrix(df, dimensions=features, color="species") +fig.show() +``` + +### Project data into 2D with t-SNE and `px.scatter` + +```python +from sklearn.manifold import TSNE +import plotly.express as px + +df = px.data.iris() + +features = df.loc[:, :'petal_width'] + +tsne = TSNE(n_components=2, random_state=0) +projections = tsne.fit_transform(features) + +fig = px.scatter( + projections, x=0, y=1, + color=df.species, labels={'color': 'species'} +) +fig.show() +``` + +### Project data into 3D with t-SNE and `px.scatter_3d` + +```python +from sklearn.manifold import TSNE +import plotly.express as px + +df = px.data.iris() + +features = df.loc[:, :'petal_width'] + +tsne = TSNE(n_components=3, random_state=0) +projections = tsne.fit_transform(features, ) + +fig = px.scatter_3d( + projections, x=0, y=1, z=2, + color=df.species, labels={'color': 'species'} +) +fig.update_traces(marker_size=8) +fig.show() +``` + +## Projections with UMAP + +Just like t-SNE, [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html) is a dimensionality reduction specifically designed for visualizing complex data in low dimensions (2D or 3D). As the number of data points increase, [UMAP becomes more time efficient](https://umap-learn.readthedocs.io/en/latest/benchmarking.html) compared to TSNE. + +In the example below, we see how easy it is to use UMAP as a drop-in replacement for scikit-learn's `manifold.TSNE`. 
+ +```python +from umap import UMAP +import plotly.express as px + +df = px.data.iris() + +features = df.loc[:, :'petal_width'] + +umap_2d = UMAP(n_components=2, init='random', random_state=0) +umap_3d = UMAP(n_components=3, init='random', random_state=0) + +proj_2d = umap_2d.fit_transform(features) +proj_3d = umap_3d.fit_transform(features) + +fig_2d = px.scatter( + proj_2d, x=0, y=1, + color=df.species, labels={'color': 'species'} +) +fig_3d = px.scatter_3d( + proj_3d, x=0, y=1, z=2, + color=df.species, labels={'color': 'species'} +) +fig_3d.update_traces(marker_size=5) + +fig_2d.show() +fig_3d.show() +``` + +## Visualizing image datasets + +In the following example, we show how to visualize large image datasets using UMAP. Here, we use [`load_digits`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html), a subset of the famous MNIST dataset that was downsized to 8x8 and flattened to 64 dimensions. + +```python +import plotly.express as px +from sklearn.datasets import load_digits +from umap import UMAP + +digits = load_digits() + +umap_2d = UMAP(random_state=0) +umap_2d.fit(digits.data) + +projections = umap_2d.transform(digits.data) + +fig = px.scatter( + projections, x=0, y=1, + color=digits.target.astype(str), labels={'color': 'digit'} +) +fig.show() +``` + +### Reference From 802d1ef15a2cd0f4c3f3636a97a6659c52a39bf4 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Wed, 12 Aug 2020 13:38:36 -0400 Subject: [PATCH 25/35] ML Docs: More explanations for the KNN section --- doc/python/ml-knn.md | 73 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index fd5e87668e9..e22be45f43a 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -20,7 +20,7 @@ jupyter: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.7.6 + version: 3.7.7 plotly: description: Visualize scikit-learn's k-Nearest Neighbors (kNN) 
classification in Python with Plotly. @@ -36,13 +36,19 @@ jupyter: ## Basic binary classification with kNN -This section gets us started with displaying basic binary classification using 2D data. We first show how to display training versus testing data using [various marker styles](https://plot.ly/python/marker-style/), then demonstrate how to evaluate a kNN classifier's performance on the **test split** using a continuous color gradient to indicate the model's predicted score. +This section gets us started with displaying basic binary classification using 2D data. We first show how to display training versus testing data using [various marker styles](https://plot.ly/python/marker-style/), then demonstrate how to evaluate our classifier's performance on the **test split** using a continuous color gradient to indicate the model's predicted score. + +We will use [Scikit-learn](https://scikit-learn.org/) for training our model and for loading and splitting data. Scikit-learn is a popular Machine Learning (ML) library that offers various tools for creating and training ML algorithms, feature engineering, data cleaning, and evaluating and testing models. It was designed to be accessible, and to work seamlessly with popular libraries like NumPy and Pandas. + +We will train a [k-Nearest Neighbors (kNN)](https://scikit-learn.org/stable/modules/neighbors.html) classifier. First, the model records the label of each training sample. Then, whenever we give it a new sample, it will look at the `k` closest samples from the training set to find the most common label, and assign it to our new sample. ### Display training and test splits -Here, we display all the negative labels as squares, and positive labels as circles. We differentiate the training and test set by adding a dot to the center of test data. +Using Scikit-learn, we first generate synthetic data that form the shape of a moon. We then split it into a training and testing set. 
Finally, we display the ground truth labels using [a scatter plot](https://plotly.com/python/line-and-scatter/). + +In the graph, we display all the negative labels as squares, and positive labels as circles. We differentiate the training and test set by adding a dot to the center of test data. ```python import numpy as np @@ -52,6 +58,7 @@ from sklearn.datasets import make_moons from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier +# Load and split data X, y = make_moons(noise=0.3, random_state=0) X_train, X_test, y_train, y_test = train_test_split( X, y.astype(str), test_size=0.25, random_state=0) @@ -78,10 +85,12 @@ fig.update_traces( fig.show() ``` -### Visualize predictions on test split +### Visualize predictions on test split with [`plotly.express`](https://plotly.com/python/plotly-express/) + +Now, we train the kNN model on the same training data displayed in the previous graph. Then, we predict the confidence score of the model for each of the data points in the test set. We will use shapes to denote the true labels, and the color will indicate the confidence of the model for assign that score. -Now, we evaluate the model only on the test set. Notice that `px.scatter` only require 1 function call to plot both negative and positive labels, and can additionally set a continuous color scale based on the `y_score` output by our kNN model. +Notice that `px.scatter` only require 1 function call to plot both negative and positive labels, and can additionally set a continuous color scale based on the `y_score` output by our kNN model. ```python import numpy as np @@ -114,6 +123,56 @@ fig.show() ## Probability Estimates with `go.Contour` +Just like the previous example, we will first train our kNN model on the training set. + +Instead of predicting the conference for the test set, we can predict the confidence map for the entire area that wraps around the dimensions of our dataset. 
To do this, we use [`np.meshgrid`](https://numpy.org/doc/stable/reference/generated/numpy.meshgrid.html) to create a grid, where the distance between each point is denoted by the `mesh_size` variable. + +Then, for each of those points, we will use our model to give a confidence score, and plot it with a [contour plot](https://plotly.com/python/contour-plots/). + +```python +import numpy as np +import plotly.express as px +import plotly.graph_objects as go +from sklearn.datasets import make_moons +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier + +mesh_size = .02 +margin = 0.25 + +# Load and split data +X, y = make_moons(noise=0.3, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, y.astype(str), test_size=0.25, random_state=0) + +# Create a mesh grid on which we will run our model +x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin +y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin +xrange = np.arange(x_min, x_max, mesh_size) +yrange = np.arange(y_min, y_max, mesh_size) +xx, yy = np.meshgrid(xrange, yrange) + +# Create classifier, run predictions on grid +clf = KNeighborsClassifier(15, weights='uniform') +clf.fit(X, y) +Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] +Z = Z.reshape(xx.shape) + + +# Plot the figure +fig = go.Figure(data=[ + go.Contour( + x=xrange, + y=yrange, + z=Z, + colorscale='RdBu' + ) +]) +fig.show() +``` + +Now, let's try to combine our `go.Contour` plot with the first scatter plot of our data points, so that we can visually compare the confidence of our model with the true labels. + ```python import numpy as np import plotly.express as px @@ -178,9 +237,9 @@ fig.add_trace( fig.show() ``` -## Multi-class prediction confidence with `go.Heatmap` +## Multi-class prediction confidence with [`go.Heatmap`](https://plotly.com/python/heatmaps/) -It is also possible to visualize the prediction confidence of the model using `go.Heatmap`. 
In this example, you can see how to compute how confident the model is about its prediction at every point in the 2D grid. Here, we define the confidence as the difference between the highest score and the score of the other classes summed, at a certain point. +It is also possible to visualize the prediction confidence of the model using [heatmaps](https://plotly.com/python/heatmaps/). In this example, you can see how to compute how confident the model is about its prediction at every point in the 2D grid. Here, we define the confidence as the difference between the highest score and the score of the other classes summed, at a certain point. ```python import numpy as np From 5cda611bfe6ca3757a0864594c1f0f6afb533591 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Wed, 12 Aug 2020 16:04:41 -0400 Subject: [PATCH 26/35] Rename Tsne tutorial --- .../{tsne-umap-projections.md => ml-tsne-umap-projections.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename doc/python/{tsne-umap-projections.md => ml-tsne-umap-projections.md} (99%) diff --git a/doc/python/tsne-umap-projections.md b/doc/python/ml-tsne-umap-projections.md similarity index 99% rename from doc/python/tsne-umap-projections.md rename to doc/python/ml-tsne-umap-projections.md index a2a2aa78604..f43f2e18d14 100644 --- a/doc/python/tsne-umap-projections.md +++ b/doc/python/ml-tsne-umap-projections.md @@ -20,7 +20,7 @@ jupyter: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.7.6 + version: 3.7.7 plotly: description: Visualize scikit-learn's k-Nearest Neighbors (kNN) classification in Python with Plotly. 
From 46d93de46bf8163dc4c4e5d201c2787909338a3f Mon Sep 17 00:00:00 2001 From: xhlulu Date: Wed, 12 Aug 2020 16:04:59 -0400 Subject: [PATCH 27/35] Update kNN page --- doc/python/ml-knn.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index e22be45f43a..6c86de79240 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -45,7 +45,6 @@ We will train a [k-Nearest Neighbors (kNN)](https://scikit-learn.org/stable/modu ### Display training and test splits - Using Scikit-learn, we first generate synthetic data that form the shape of a moon. We then split it into a training and testing set. Finally, we display the ground truth labels using [a scatter plot](https://plotly.com/python/line-and-scatter/). In the graph, we display all the negative labels as squares, and positive labels as circles. We differentiate the training and test set by adding a dot to the center of test data. From 38ef59d28d9300b4545eef7931c8363e926423fa Mon Sep 17 00:00:00 2001 From: xhlulu Date: Wed, 12 Aug 2020 16:05:07 -0400 Subject: [PATCH 28/35] ML Docs: Update PCA page --- doc/python/ml-pca.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/python/ml-pca.md b/doc/python/ml-pca.md index 41cab7a64ff..5395fd3dc04 100644 --- a/doc/python/ml-pca.md +++ b/doc/python/ml-pca.md @@ -20,7 +20,7 @@ jupyter: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.7.6 + version: 3.7.7 plotly: description: Visualize Principle Component Analysis (PCA) of your high-dimensional data in Python with Plotly. @@ -34,12 +34,21 @@ jupyter: thumbnail: thumbnail/ml-pca.png --- +This page first shows how to visualize higher dimension data using various Plotly figures combined with dimensionality reduction (aka projection). Then, we dive into the specific details of our projection algorithm. 
+ +We will use [Scikit-learn](https://scikit-learn.org/) to load one of the datasets, and apply dimensionality reduction. Scikit-learn is a popular Machine Learning (ML) library that offers various tools for creating and training ML algorithms, feature engineering, data cleaning, and evaluating and testing models. It was designed to be accessible, and to work seamlessly with popular libraries like NumPy and Pandas. + + ## High-dimensional PCA Analysis with `px.scatter_matrix` +The dimensionality reduction technique we will be using is called [Principal Component Analysis (PCA)](https://scikit-learn.org/stable/modules/decomposition.html#pca). It is a powerful technique that arises from linear algebra and probability theory. In essence, it computes a matrix that represents the variation of your data ([covariance matrix/eigenvectors][covmatrix]), and ranks them by their relevance (explained variance/eigenvalues). For a video tutorial, see [this segment on PCA](https://youtu.be/rng04VJxUt4?t=98) from the Coursera ML course. + +[covmatrix]: https://stats.stackexchange.com/questions/2691/making-sense-of-principal-component-analysis-eigenvectors-eigenvalues#:~:text=As%20it%20is%20a%20square%20symmetric%20matrix%2C%20it%20can%20be%20diagonalized%20by%20choosing%20a%20new%20orthogonal%20coordinate%20system%2C%20given%20by%20its%20eigenvectors%20(incidentally%2C%20this%20is%20called%20spectral%20theorem)%3B%20corresponding%20eigenvalues%20will%20then%20be%20located%20on%20the%20diagonal.%20In%20this%20new%20coordinate%20system%2C%20the%20covariance%20matrix%20is%20diagonal%20and%20looks%20like%20that%3A + ### Visualize all the original dimensions -First, let's plot all the features and see how the `species` in the Iris dataset are grouped. In a [splom](https://plot.ly/python/splom/), each subplot displays a feature against another, so if we have $N$ features we have a $N \times N$ matrix.
+First, let's plot all the features and see how the `species` in the Iris dataset are grouped. In a [Scatter Plot Matrix (splom)](https://plot.ly/python/splom/), each subplot displays a feature against another, so if we have $N$ features we have a $N \times N$ matrix. In our example, we are plotting all 4 features from the Iris dataset, thus we can see how `sepal_width` is compared against `sepal_length`, then against `petal_width`, and so forth. Keep in mind how some pairs of features can more easily separate different species. @@ -169,6 +178,8 @@ fig.show() Often, you might be interested in seeing how much variance PCA is able to explain as you increase the number of components, in order to decide how many dimensions to ultimately keep or analyze. This example shows you how to quickly plot the cumulative sum of explained variance for a high-dimensional dataset like [Diabetes](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset). +With a higher explained variance, you are able to capture more variability in your dataset, which could potentially lead to better performance when training your model. For a more mathematical explanation, see this [Q&A thread](https://stats.stackexchange.com/questions/22569/pca-and-proportion-of-variance-explained). + ```python import numpy as np import pandas as pd @@ -198,6 +209,8 @@ $$ loadings = eigenvectors \cdot \sqrt{eigenvalues} $$ +For more details about the linear algebra behind eigenvectors and loadings, see this [Q&A thread](https://stats.stackexchange.com/questions/143905/loadings-vs-eigenvectors-in-pca-when-to-use-one-or-another). 
+ ```python import plotly.express as px from sklearn.decomposition import PCA @@ -244,3 +257,5 @@ The following resources offer an in-depth overview of PCA and explained variance * https://en.wikipedia.org/wiki/Explained_variation * https://scikit-learn.org/stable/modules/decomposition.html#pca * https://stats.stackexchange.com/questions/2691/making-sense-of-principal-component-analysis-eigenvectors-eigenvalues/140579#140579 +* https://stats.stackexchange.com/questions/143905/loadings-vs-eigenvectors-in-pca-when-to-use-one-or-another +* https://stats.stackexchange.com/questions/22569/pca-and-proportion-of-variance-explained From a954d0d3631374b1c1ad520c577221af0a526111 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Wed, 12 Aug 2020 16:05:13 -0400 Subject: [PATCH 29/35] ML Docs: Update regression page --- doc/python/ml-regression.md | 38 +++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index d1945f742b3..2dd99e0b067 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -20,7 +20,7 @@ jupyter: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.7.6 + version: 3.7.7 plotly: description: Visualize regression in scikit-learn with Plotly. display_as: ai_ml @@ -33,14 +33,29 @@ jupyter: thumbnail: thumbnail/ml-regression.png --- + +This page shows how to use Plotly charts for displaying various types of regression models, starting from simple models like [Linear Regression](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html), and progressively move towards models like [Decision Tree][tree] and [Polynomial Features][poly]. 
We highlight various capabilities of plotly, such as comparative analysis of the same model with different parameters, displaying Latex, [surface plots](https://plotly.com/python/3d-surface-plots/) for 3D data, and enhanced prediction error analysis with [Plotly Express](https://plotly.com/python/plotly-express/). + +We will use [Scikit-learn](https://scikit-learn.org/) to split and preprocess our data and train various regression models. Scikit-learn is a popular Machine Learning (ML) library that offers various tools for creating and training ML algorithms, feature engineering, data cleaning, and evaluating and testing models. It was designed to be accessible, and to work seamlessly with popular libraries like NumPy and Pandas. + + +[lasso]: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html +[tree]: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html +[poly]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html + + ## Basic linear regression plots +In this section, we show you how to apply a simple regression model for predicting tips a server will receive based on various client attributes (such as sex, time of the week, and whether they are a smoker). -### Ordinary Least Square (OLS) with `plotly.express` +We will be using [Linear Regression][lr], which is a simple model that fits an intercept (the mean tip received by a server), and adds a slope for each feature we use, such as the value of the total bill. We show you how to do that with both Plotly Express and Scikit-learn. +### Ordinary Least Square (OLS) with `plotly.express` This example shows how to use `plotly.express`'s `trendline` parameter to train a simply Ordinary Least Square (OLS) for predicting the tips waiters will receive based on the value of the total bill.
+[lr]: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html + ```python import plotly.express as px @@ -78,7 +93,7 @@ fig.show() ## Model generalization on unseen data -Easily color your plot based on a predefined data split. +With `go.Scatter`, you can easily color your plot based on a predefined data split. By coloring the training and the testing data points with different colors, you can easily see whether the model generalizes well to the test data or not. ```python import numpy as np @@ -108,7 +123,11 @@ fig.show() ## Comparing different kNN models parameters -Compare the performance of two different models on the same dataset. This can be easily combined with discrete color legends from `px`, such as coloring by the assigned `sex`. +In addition to linear regression, it's possible to fit the same data using [k-Nearest Neighbors][knn]. When you perform a prediction on a new sample, this model either takes the weighted or un-weighted average of the neighbors. In order to see the difference between those two averaging options, we train a kNN model with both of those parameters, and we plot them in the same way as the previous graph. + +Notice how we can combine scatter points with lines using Plotly.py. You can learn more about [multiple chart types](https://plotly.com/python/graphing-multiple-chart-types/). + +[knn]: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html ```python import numpy as np @@ -136,9 +155,14 @@ fig.add_traces(go.Scatter(x=x_range, y=y_dist, name='Weights: Distance')) fig.show() ``` + ## Displaying `PolynomialFeatures` using $\LaTeX$ -It's easy to diplay latex equations in legend and titles by simply adding `$` before and after your equation. +Notice how linear regression fits a straight line, but kNN can take non-linear shapes.
Moreover, it is possible to extend linear regression to polynomial regression by using scikit-learn's `PolynomialFeatures`, which lets you fit a slope for your features raised to the power of `n`, where `n=1,2,3,4` in our example. + + +With Plotly, it's easy to display LaTeX equations in legend and titles by simply adding `$` before and after your equation. This way, you can see the coefficients that our polynomial regression fitted. + ```python import numpy as np @@ -220,7 +244,9 @@ fig.show() ## Visualizing coefficients for multiple linear regression (MLR) -When you are fitting a linear regression, you want to often know what feature matters the most in your regression's output. +Visualizing regression with one or two variables is straightforward, since we can respectively plot them with scatter plots and 3D scatter plots. However, if you have more than 2 features, you will need to find alternative ways to visualize your data. + +One way is to use [bar charts](https://plotly.com/python/bar-charts/). In our example, each bar indicates the coefficients of our linear regression model for each input feature. Our model was trained on the [Iris dataset](https://archive.ics.uci.edu/ml/datasets/iris). ```python import pandas as pd From 2152601d7cb79dedd39417b0e7c62c590aa0d794 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Wed, 12 Aug 2020 16:05:21 -0400 Subject: [PATCH 30/35] ML Docs: Update ROC/PR Page --- doc/python/ml-roc-pr.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/python/ml-roc-pr.md b/doc/python/ml-roc-pr.md index ad30cd951b3..21abf071880 100644 --- a/doc/python/ml-roc-pr.md +++ b/doc/python/ml-roc-pr.md @@ -20,7 +20,7 @@ jupyter: name: python nbconvert_exporter: python pygments_lexer: ipython3 - version: 3.7.6 + version: 3.7.7 plotly: description: Interpret the results of your classification using Receiver Operating Characteristics (ROC) and Precision-Recall (PR) Curves in Python with Plotly.
From 209dfea292d35cb09a9bcb89d7756426c181733e Mon Sep 17 00:00:00 2001 From: xhlulu Date: Wed, 12 Aug 2020 19:38:37 -0400 Subject: [PATCH 31/35] ML Docs: Update T-sne and UMAP section --- doc/python/ml-tsne-umap-projections.md | 35 ++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/doc/python/ml-tsne-umap-projections.md b/doc/python/ml-tsne-umap-projections.md index f43f2e18d14..db0ef55b5d0 100644 --- a/doc/python/ml-tsne-umap-projections.md +++ b/doc/python/ml-tsne-umap-projections.md @@ -34,11 +34,22 @@ jupyter: thumbnail: thumbnail/tsne-umap-projections.png --- +This page presents various ways to visualize two popular dimensionality reduction techniques, namely the [t-distributed stochastic neighbor embedding](https://lvdmaaten.github.io/tsne/) (t-SNE) and [Uniform Manifold Approximation and Projection](https://umap-learn.readthedocs.io/en/latest/index.html) (UMAP). They are needed whenever you want to visualize data with more than two or three features (i.e. dimensions). + +We first show how to visualize data with more than three features using the [scatter plot matrix](https://medium.com/plotly/what-is-a-splom-chart-make-scatterplot-matrices-in-python-8dc4998921c3), then we apply dimensionality reduction techniques to get 2D/3D representation of our data, and visualize the results with [scatter plots](https://plotly.com/python/line-and-scatter/) and [3D scatter plots](https://plotly.com/python/3d-scatter-plots/). + + ## Basic t-SNE projections +t-SNE is a popular dimensionality reduction algorithm that arises from probability theory. Simply put, it projects the high-dimensional data points (sometimes with hundreds of features) into 2D/3D by inducing the projected data to have a similar distribution as the original data points by minimizing something called the [KL divergence](https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-understanding-kl-divergence-2b382ca2b2a8). 
+ +Compared to a method like Principal Component Analysis (PCA), it takes significantly more time to converge, but presents significantly better insights when visualized. For example, by projecting features of flowers, it will be able to distinctly group the different species. + ### Visualizing high-dimensional data with `px.scatter_matrix` +First, let's try to visualize every feature of the [Iris dataset](https://archive.ics.uci.edu/ml/datasets/iris), and color everything by the species. We will use the Scatter Plot Matrix ([splom](https://plotly.com/python/splom/)), which lets us plot each feature against everything else, which is convenient when your dataset has more than 3 dimensions. + ```python import plotly.express as px @@ -50,6 +61,8 @@ fig.show() ### Project data into 2D with t-SNE and `px.scatter` +Now, let's use the t-SNE algorithm to project the data shown above into two dimensions. Notice how each of the species is physically separate from each other. + ```python from sklearn.manifold import TSNE import plotly.express as px @@ -70,6 +83,8 @@ fig.show() ### Project data into 3D with t-SNE and `px.scatter_3d` +t-SNE can reduce your data to any number of dimensions you want! Here, we show you how to project it to 3D and visualize with a 3D scatter plot. + ```python from sklearn.manifold import TSNE import plotly.express as px @@ -125,7 +140,9 @@ fig_3d.show() ## Visualizing image datasets -In the following example, we show how to visualize large image datasets using UMAP. Here, we use [`load_digits`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html), a subset of the famous MNIST dataset that was downsized to 8x8 and flattened to 64 dimensions. +In the following example, we show how to visualize large image datasets using UMAP.
Here, we use [`load_digits`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html), a subset of the famous [MNIST dataset](http://yann.lecun.com/exdb/mnist/) that was downsized to 8x8 and flattened to 64 dimensions. + +Although there's over 1000 data points, and many more dimensions than the previous example, it is still extremely fast. This is because UMAP is optimized for speed, both from a theoretical perspective, and in the way it is implemented. Learn more in [this comparison post](https://umap-learn.readthedocs.io/en/latest/benchmarking.html). ```python import plotly.express as px @@ -146,4 +163,18 @@ fig = px.scatter( fig.show() ``` -### Reference + +## Reference + +Plotly figures: +* https://plotly.com/python/line-and-scatter/ +* https://plotly.com/python/3d-scatter-plots/ +* https://plotly.com/python/splom/ + + +Details about algorithms: +* UMAP library: https://umap-learn.readthedocs.io/en/latest/ +* t-SNE User guide: https://scikit-learn.org/stable/modules/manifold.html#t-sne +* t-SNE paper: https://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf +* MNIST: http://yann.lecun.com/exdb/mnist/ + From c419b09284b6f4aa6bbbc982ded08e955c59d742 Mon Sep 17 00:00:00 2001 From: xhlulu Date: Thu, 13 Aug 2020 11:23:35 -0400 Subject: [PATCH 32/35] Add umap to requirements --- doc/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/requirements.txt b/doc/requirements.txt index 8b46ed96701..63860785d93 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -26,3 +26,4 @@ python-frontmatter datashader pyarrow cufflinks==0.17.3 +umap-learn \ No newline at end of file From caebf4951b900006a7499e0272ec1b32317d6028 Mon Sep 17 00:00:00 2001 From: Nicolas Kruchten Date: Tue, 18 Aug 2020 13:35:17 -0400 Subject: [PATCH 33/35] fixups --- doc/python/ml-knn.md | 70 +++++++++---------- doc/python/ml-pca.md | 18 +++-- doc/python/ml-regression.md | 36 +++++----- doc/python/ml-roc-pr.md | 93 
+++++++++++++------------- doc/python/ml-tsne-umap-projections.md | 23 +++---- 5 files changed, 124 insertions(+), 116 deletions(-) diff --git a/doc/python/ml-knn.md b/doc/python/ml-knn.md index 6c86de79240..6d823cf49b1 100644 --- a/doc/python/ml-knn.md +++ b/doc/python/ml-knn.md @@ -5,8 +5,8 @@ jupyter: text_representation: extension: .md format_name: markdown - format_version: '1.1' - jupytext_version: 1.1.1 + format_version: '1.2' + jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python @@ -28,8 +28,8 @@ jupyter: language: python layout: base name: kNN Classification - order: 1 - page_type: example_index + order: 2 + page_type: u-guide permalink: python/knn-classification/ thumbnail: thumbnail/knn-classification.png --- @@ -49,10 +49,11 @@ Using Scikit-learn, we first generate synthetic data that form the shape of a mo In the graph, we display all the negative labels as squares, and positive labels as circles. We differentiate the training and test set by adding a dot to the center of test data. +In this example, we will use [graph objects](/python/graph-objects/), Plotly's low-level API for building figures. + ```python -import numpy as np -import plotly.express as px import plotly.graph_objects as go +import numpy as np from sklearn.datasets import make_moons from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier @@ -72,13 +73,13 @@ trace_specs = [ fig = go.Figure(data=[ go.Scatter( x=X[y==label, 0], y=X[y==label, 1], - name=f'{split} Split, Label {label}', + name=f'{split} Split, Label {label}', mode='markers', marker_symbol=marker ) for X, y, label, split, marker in trace_specs ]) fig.update_traces( - marker_size=12, marker_line_width=1.5, + marker_size=12, marker_line_width=1.5, marker_color="lightyellow" ) fig.show() @@ -89,12 +90,11 @@ fig.show() Now, we train the kNN model on the same training data displayed in the previous graph. 
Then, we predict the confidence score of the model for each of the data points in the test set. We will use shapes to denote the true labels, and the color will indicate the confidence of the model for assign that score. -Notice that `px.scatter` only require 1 function call to plot both negative and positive labels, and can additionally set a continuous color scale based on the `y_score` output by our kNN model. +In this example, we will use [Plotly Express](/python/plotly-express/), Plotly's high-level API for building figures. Notice that `px.scatter` only require 1 function call to plot both negative and positive labels, and can additionally set a continuous color scale based on the `y_score` output by our kNN model. ```python -import numpy as np import plotly.express as px -import plotly.graph_objects as go +import numpy as np from sklearn.datasets import make_moons from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier @@ -110,7 +110,7 @@ clf.fit(X_train, y_train) y_score = clf.predict_proba(X_test)[:, 1] fig = px.scatter( - X_test, x=0, y=1, + X_test, x=0, y=1, color=y_score, color_continuous_scale='RdBu', symbol=y_test, symbol_map={'0': 'square-dot', '1': 'circle-dot'}, labels={'symbol': 'label', 'color': 'score of
first class'} @@ -124,14 +124,15 @@ fig.show() Just like the previous example, we will first train our kNN model on the training set. -Instead of predicting the conference for the test set, we can predict the confidence map for the entire area that wraps around the dimensions of our dataset. To do this, we use [`np.meshgrid`](https://numpy.org/doc/stable/reference/generated/numpy.meshgrid.html) to create a grid, where the distance between each point is denoted by the `mesh_size` variable. +Instead of predicting the conference for the test set, we can predict the confidence map for the entire area that wraps around the dimensions of our dataset. To do this, we use [`np.meshgrid`](https://numpy.org/doc/stable/reference/generated/numpy.meshgrid.html) to create a grid, where the distance between each point is denoted by the `mesh_size` variable. Then, for each of those points, we will use our model to give a confidence score, and plot it with a [contour plot](https://plotly.com/python/contour-plots/). +In this example, we will use [graph objects](/python/graph-objects/), Plotly's low-level API for building figures. + ```python -import numpy as np -import plotly.express as px import plotly.graph_objects as go +import numpy as np from sklearn.datasets import make_moons from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier @@ -161,11 +162,11 @@ Z = Z.reshape(xx.shape) # Plot the figure fig = go.Figure(data=[ go.Contour( - x=xrange, - y=yrange, - z=Z, + x=xrange, + y=yrange, + z=Z, colorscale='RdBu' - ) + ) ]) fig.show() ``` @@ -173,9 +174,8 @@ fig.show() Now, let's try to combine our `go.Contour` plot with the first scatter plot of our data points, so that we can visually compare the confidence of our model with the true labels. 
```python -import numpy as np -import plotly.express as px import plotly.graph_objects as go +import numpy as np from sklearn.datasets import make_moons from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier @@ -211,21 +211,21 @@ trace_specs = [ fig = go.Figure(data=[ go.Scatter( x=X[y==label, 0], y=X[y==label, 1], - name=f'{split} Split, Label {label}', + name=f'{split} Split, Label {label}', mode='markers', marker_symbol=marker ) for X, y, label, split, marker in trace_specs ]) fig.update_traces( - marker_size=12, marker_line_width=1.5, + marker_size=12, marker_line_width=1.5, marker_color="lightyellow" ) fig.add_trace( go.Contour( - x=xrange, - y=yrange, - z=Z, + x=xrange, + y=yrange, + z=Z, showscale=False, colorscale='RdBu', opacity=0.4, @@ -240,10 +240,12 @@ fig.show() It is also possible to visualize the prediction confidence of the model using [heatmaps](https://plotly.com/python/heatmaps/). In this example, you can see how to compute how confident the model is about its prediction at every point in the 2D grid. Here, we define the confidence as the difference between the highest score and the score of the other classes summed, at a certain point. +In this example, we will use [Plotly Express](/python/plotly-express/), Plotly's high-level API for building figures. 
+ ```python -import numpy as np import plotly.express as px import plotly.graph_objects as go +import numpy as np from sklearn.neighbors import KNeighborsClassifier mesh_size = .02 @@ -275,21 +277,21 @@ diff = proba.max(axis=-1) - (proba.sum(axis=-1) - proba.max(axis=-1)) fig = px.scatter( df_test, x='sepal_length', y='sepal_width', - symbol='species', + symbol='species', symbol_map={ - 'setosa': 'square-dot', - 'versicolor': 'circle-dot', + 'setosa': 'square-dot', + 'versicolor': 'circle-dot', 'virginica': 'diamond-dot'}, ) fig.update_traces( - marker_size=12, marker_line_width=1.5, + marker_size=12, marker_line_width=1.5, marker_color="lightyellow" ) fig.add_trace( go.Heatmap( - x=lrange, - y=wrange, - z=diff, + x=lrange, + y=wrange, + z=diff, opacity=0.25, customdata=proba, colorscale='RdBu', diff --git a/doc/python/ml-pca.md b/doc/python/ml-pca.md index 5395fd3dc04..3737a2c5e3a 100644 --- a/doc/python/ml-pca.md +++ b/doc/python/ml-pca.md @@ -29,7 +29,7 @@ jupyter: layout: base name: PCA Visualization order: 4 - page_type: example_index + page_type: u-guide permalink: python/pca-visualization/ thumbnail: thumbnail/ml-pca.png --- @@ -52,6 +52,8 @@ First, let's plot all the features and see how the `species` in the Iris dataset In our example, we are plotting all 4 features from the Iris dataset, thus we can see how `sepal_width` is compared against `sepal_length`, then against `petal_width`, and so forth. Keep in mind how some pairs of features can more easily separate different species. +In this example, we will use [Plotly Express](/python/plotly-express/), Plotly's high-level API for building figures. + ```python import plotly.express as px @@ -69,10 +71,12 @@ fig.show() ### Visualize all the principal components -Now, we apply `PCA` the same dataset, and retrieve **all** the components. 
We use the same `px.scatter_matrix` trace to display our results, but this time our features are the resulting *principal components*, ordered by how much variance they are able to explain. +Now, we apply `PCA` to the same dataset, and retrieve **all** the components. We use the same `px.scatter_matrix` trace to display our results, but this time our features are the resulting *principal components*, ordered by how much variance they are able to explain. The importance of explained variance is demonstrated in the example below. The subplot between PC3 and PC4 is clearly unable to separate each class, whereas the subplot between PC1 and PC2 shows a clear separation between each species. +In this example, we will use [Plotly Express](/python/plotly-express/), Plotly's high-level API for building figures. + ```python import plotly.express as px from sklearn.decomposition import PCA @@ -83,7 +87,7 @@ features = ["sepal_width", "sepal_length", "petal_width", "petal_length"] pca = PCA() components = pca.fit_transform(df[features]) labels = { - str(i): f"PC {i+1} ({var:.1f}%)" + str(i): f"PC {i+1} ({var:.1f}%)" for i, var in enumerate(pca.explained_variance_ratio_ * 100) } @@ -122,7 +126,7 @@ labels = {str(i): f"PC {i+1}" for i in range(n_components)} labels['color'] = 'Median Price' fig = px.scatter_matrix( - components, + components, color=boston.target, dimensions=range(n_components), labels=labels, @@ -167,7 +171,7 @@ components = pca.fit_transform(X) total_var = pca.explained_variance_ratio_.sum() * 100 fig = px.scatter_3d( - components, x=0, y=1, z=2, color=df['species'], + components, x=0, y=1, z=2, color=df['species'], title=f'Total Explained Variance: {total_var:.2f}%', labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'} ) @@ -181,9 +185,9 @@ Often, you might be interested in seeing how much variance PCA is able to explai With a higher explained variance, you are able to capture more variability in your dataset, which could potentially lead to better performance when 
training your model. For a more mathematical explanation, see this [Q&A thread](https://stats.stackexchange.com/questions/22569/pca-and-proportion-of-variance-explained). ```python +import plotly.express as px import numpy as np import pandas as pd -import plotly.express as px from sklearn.decomposition import PCA from sklearn.datasets import load_diabetes @@ -196,7 +200,7 @@ exp_var_cumul = np.cumsum(pca.explained_variance_ratio_) px.area( x=range(1, exp_var_cumul.shape[0] + 1), - y=exp_var_cumul, + y=exp_var_cumul, labels={"x": "# Components", "y": "Explained Variance"} ) ``` diff --git a/doc/python/ml-regression.md b/doc/python/ml-regression.md index 2dd99e0b067..42215c7ad9d 100644 --- a/doc/python/ml-regression.md +++ b/doc/python/ml-regression.md @@ -5,8 +5,8 @@ jupyter: text_representation: extension: .md format_name: markdown - format_version: '1.1' - jupytext_version: 1.1.1 + format_version: '1.2' + jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python @@ -27,8 +27,8 @@ jupyter: language: python layout: base name: ML Regression - order: 2 - page_type: example_index + order: 1 + page_type: u-guide permalink: python/ml-regression/ thumbnail: thumbnail/ml-regression.png --- @@ -52,7 +52,7 @@ We will be using the [Linear Regression][lr], which is a simple model that fit a ### Ordinary Least Square (OLS) with `plotly.express` -This example shows how to use `plotly.express`'s `trendline` parameter to train a simply Ordinary Least Square (OLS) for predicting the tips waiters will receive based on the value of the total bill. +This example shows [how to use `plotly.express`'s `trendline` parameter to train a simple Ordinary Least Square (OLS) model](/python/linear-fits/) for predicting the tips waiters will receive based on the value of the total bill. 
[lr]: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html @@ -174,11 +174,11 @@ from sklearn.preprocessing import PolynomialFeatures def format_coefs(coefs): equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)] equation = "$" + " + ".join(equation_list) + "$" - + replace_map = {"x^0": "", "x^1": "x", '+ -': '- '} for old, new in replace_map.items(): equation = equation.replace(old, new) - + return equation df = px.data.tips() @@ -195,7 +195,7 @@ for degree in [1, 2, 3, 4]: model = LinearRegression(fit_intercept=False) model.fit(X_poly, df.tip) y_poly = model.predict(x_range_poly) - + equation = format_coefs(model.coef_.round(2)) fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation)) @@ -300,7 +300,7 @@ y_pred = model.predict(X) fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'}) fig.add_shape( type="line", line=dict(dash='dash'), - x0=y.min(), y0=y.min(), + x0=y.min(), y0=y.min(), x1=y.max(), y1=y.max() ) fig.show() @@ -341,7 +341,7 @@ fig = px.scatter( fig.update_traces(histnorm='probability', selector={'type':'histogram'}) fig.add_shape( type="line", line=dict(dash='dash'), - x0=y.min(), y0=y.min(), + x0=y.min(), y0=y.min(), x1=y.max(), y1=y.max() ) @@ -411,14 +411,14 @@ mean_alphas = model.mse_path_.mean(axis=-1) fig = go.Figure([ go.Scatter( - x=model.alphas_, y=model.mse_path_[:, i], + x=model.alphas_, y=model.mse_path_[:, i], name=f"Fold: {i+1}", opacity=.5, line=dict(dash='dash'), hovertemplate="alpha: %{x}
MSE: %{y}" ) for i in range(N_FOLD) ]) fig.add_traces(go.Scatter( - x=model.alphas_, y=mean_alphas, + x=model.alphas_, y=mean_alphas, name='Mean', line=dict(color='black', width=3), hovertemplate="alpha: %{x}
MSE: %{y}", )) @@ -431,8 +431,8 @@ fig.add_shape( ) fig.update_layout( - xaxis_title='alpha', - xaxis_type="log", + xaxis_title='alpha', + xaxis_type="log", yaxis_title="Mean Square Error (MSE)" ) fig.show() @@ -462,14 +462,14 @@ y = df['petal_width'] # Define and fit the grid model = DecisionTreeRegressor() param_grid = { - 'criterion': ['mse', 'friedman_mse', 'mae'], + 'criterion': ['mse', 'friedman_mse', 'mae'], 'max_depth': range(2, 5) } grid = GridSearchCV(model, param_grid, cv=N_FOLD) grid.fit(X, y) grid_df = pd.DataFrame(grid.cv_results_) -# Convert the wide format of the grid into the long format +# Convert the wide format of the grid into the long format # accepted by plotly.express melted = ( grid_df @@ -491,7 +491,7 @@ melted['cv_split'] = ( # Single function call to plot each figure fig_hmap = px.density_heatmap( - melted, x="max_depth", y='criterion', + melted, x="max_depth", y='criterion', histfunc="sum", z="r_squared", title='Grid search results on individual fold', hover_data=['mean_fit_time'], @@ -500,7 +500,7 @@ fig_hmap = px.density_heatmap( ) fig_box = px.box( - melted, x='max_depth', y='r_squared', + melted, x='max_depth', y='r_squared', title='Grid search results ', hover_data=['mean_fit_time'], points='all', diff --git a/doc/python/ml-roc-pr.md b/doc/python/ml-roc-pr.md index 21abf071880..eced1074109 100644 --- a/doc/python/ml-roc-pr.md +++ b/doc/python/ml-roc-pr.md @@ -5,8 +5,8 @@ jupyter: text_representation: extension: .md format_name: markdown - format_version: '1.1' - jupytext_version: 1.1.1 + format_version: '1.2' + jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python @@ -29,7 +29,7 @@ jupyter: layout: base name: ROC and PR Curves order: 3 - page_type: example_index + page_type: u-guide permalink: python/roc-and-pr-curves/ thumbnail: thumbnail/ml-roc-pr.png --- @@ -38,13 +38,13 @@ jupyter: Before diving into the receiver operating characteristic (ROC) curve, we will look at two plots that will give some context to 
the thresholds mechanism behind the ROC and PR curves. -In the histogram, we observe that the score spread such that most of the positive labels are binned near 1, and a lot of the negative labels are close to 0. When we set a threshold on the score, all of the bins to its left will be classified as 0's, and everything to the right will be 1's. There are obviously a few outliers, such as **negative** samples that our model gave a high score, and *positive* samples with a low score. If we set a threshold right in the middle, those outliers will respectively become **false positives** and *false negatives*. +In the histogram, we observe that the score spread such that most of the positive labels are binned near 1, and a lot of the negative labels are close to 0. When we set a threshold on the score, all of the bins to its left will be classified as 0's, and everything to the right will be 1's. There are obviously a few outliers, such as **negative** samples that our model gave a high score, and *positive* samples with a low score. If we set a threshold right in the middle, those outliers will respectively become **false positives** and *false negatives*. As we adjust thresholds, the number of positive positives will increase or decrease, and at the same time the number of true positives will also change; this is shown in the second plot. As you can see, the model seems to perform fairly well, because the true positive rate decreases slowly, whereas the false positive rate decreases sharply as we increase the threshold. Those two lines each represent a dimension of the ROC curve. 
```python import plotly.express as px -import plotly.graph_objects as go +import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_curve, auc from sklearn.datasets import make_classification @@ -58,26 +58,28 @@ fpr, tpr, thresholds = roc_curve(y, y_score) # The histogram of scores compared to true labels fig_hist = px.histogram( - x=y_score, color=y, nbins=50, + x=y_score, color=y, nbins=50, labels=dict(color='True Labels', x='Score') ) +fig_hist.show() + + # Evaluating model performance at various thresholds -fig_thresh = go.Figure([ - go.Scatter(x=thresholds, y=fpr, name='False Positive Rate'), - go.Scatter(x=thresholds, y=tpr, name='True Positive Rate') -]) -fig_thresh.update_layout( - title='TPR and FPR at every threshold', - xaxis_title='Threshold', - yaxis_title='Rate', - yaxis=dict(scaleanchor="x", scaleratio=1), - xaxis=dict(constrain='domain') +df = pd.DataFrame({ + 'False Positive Rate': fpr, + 'True Positive Rate': tpr +}, index=thresholds) +df.index.name = "Thresholds" +df.columns.name = "Rate" + +fig_thresh = px.line( + df, title='TPR and FPR at every threshold', + width=700, height=500 ) -fig_thresh.update_xaxes(range=[0, 1]) -# Display plots -fig_hist.show() +fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1) +fig_thresh.update_xaxes(range=[0, 1], constrain='domain') fig_thresh.show() ``` @@ -102,18 +104,18 @@ y_score = model.predict_proba(X)[:, 1] fpr, tpr, thresholds = roc_curve(y, y_score) fig = px.area( - x=fpr, y=tpr, + x=fpr, y=tpr, title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})', - labels=dict(x='False Positive Rate', y='True Positive Rate') + labels=dict(x='False Positive Rate', y='True Positive Rate'), + width=700, height=500 ) fig.add_shape( - type='line', line=dict(dash='dash'), + type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1 ) -fig.update_layout( - yaxis=dict(scaleanchor="x", scaleratio=1), - xaxis=dict(constrain='domain') -) + +fig.update_yaxes(scaleanchor="x", scaleratio=1) 
+fig.update_xaxes(constrain='domain') fig.show() ``` @@ -122,12 +124,12 @@ fig.show() When you have more than 2 classes, you will need to plot the ROC curve for each class separately. Make sure that you use a [one-versus-rest](https://scikit-learn.org/stable/modules/multiclass.html#one-vs-the-rest) model, or make sure that your problem has a [multi-label](https://scikit-learn.org/stable/modules/multiclass.html#multilabel-classification-format) format; otherwise, your ROC curve might not return the expected results. ```python +import plotly.graph_objects as go +import plotly.express as px import numpy as np import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_curve, roc_auc_score -import plotly.graph_objects as go -import plotly.express as px np.random.seed(0) @@ -151,17 +153,17 @@ y_scores = model.predict_proba(X) # every time we compute a new class fig = go.Figure() fig.add_shape( - type='line', line=dict(dash='dash'), + type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1 ) for i in range(y_scores.shape[1]): y_true = y_onehot.iloc[:, i] y_score = y_scores[:, i] - + fpr, tpr, _ = roc_curve(y_true, y_score) auc_score = roc_auc_score(y_true, y_score) - + name = f"{y_onehot.columns[i]} (AUC={auc_score:.2f})" fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines')) @@ -169,7 +171,8 @@ fig.update_layout( xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', yaxis=dict(scaleanchor="x", scaleratio=1), - xaxis=dict(constrain='domain') + xaxis=dict(constrain='domain'), + width=700, height=500 ) fig.show() ``` @@ -193,18 +196,17 @@ y_score = model.predict_proba(X)[:, 1] precision, recall, thresholds = precision_recall_curve(y, y_score) fig = px.area( - x=recall, y=precision, + x=recall, y=precision, title=f'Precision-Recall Curve (AUC={auc(fpr, tpr):.4f})', - labels=dict(x='Recall', y='Precision') + labels=dict(x='Recall', y='Precision'), + width=700, height=500 ) fig.add_shape( - type='line', 
line=dict(dash='dash'), + type='line', line=dict(dash='dash'), x0=0, x1=1, y0=1, y1=0 ) -fig.update_layout( - yaxis=dict(scaleanchor="x", scaleratio=1), - xaxis=dict(constrain='domain') -) +fig.update_yaxes(scaleanchor="x", scaleratio=1) +fig.update_xaxes(constrain='domain') fig.show() ``` @@ -212,12 +214,12 @@ fig.show() In this example, we use the [average precision](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html) metric, which is an alternative scoring method to the area under the PR curve. ```python +import plotly.graph_objects as go +import plotly.express as px import numpy as np import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import precision_recall_curve, average_precision_score -import plotly.graph_objects as go -import plotly.express as px np.random.seed(0) @@ -241,17 +243,17 @@ y_scores = model.predict_proba(X) # every time we compute a new class fig = go.Figure() fig.add_shape( - type='line', line=dict(dash='dash'), + type='line', line=dict(dash='dash'), x0=0, x1=1, y0=1, y1=0 ) for i in range(y_scores.shape[1]): y_true = y_onehot.iloc[:, i] y_score = y_scores[:, i] - + precision, recall, _ = precision_recall_curve(y_true, y_score) auc_score = average_precision_score(y_true, y_score) - + name = f"{y_onehot.columns[i]} (AP={auc_score:.2f})" fig.add_trace(go.Scatter(x=recall, y=precision, name=name, mode='lines')) @@ -259,7 +261,8 @@ fig.update_layout( xaxis_title='Recall', yaxis_title='Precision', yaxis=dict(scaleanchor="x", scaleratio=1), - xaxis=dict(constrain='domain') + xaxis=dict(constrain='domain'), + width=700, height=500 ) fig.show() ``` diff --git a/doc/python/ml-tsne-umap-projections.md b/doc/python/ml-tsne-umap-projections.md index db0ef55b5d0..26ca99d51b4 100644 --- a/doc/python/ml-tsne-umap-projections.md +++ b/doc/python/ml-tsne-umap-projections.md @@ -22,28 +22,27 @@ jupyter: pygments_lexer: ipython3 version: 3.7.7 plotly: - description: Visualize 
scikit-learn's k-Nearest Neighbors (kNN) classification - in Python with Plotly. + description: Visualize scikit-learn's t-SNE and UMAP in Python with Plotly. display_as: ai_ml language: python layout: base name: t-SNE and UMAP projections - order: 1 - page_type: example_index + order: 5 + page_type: u-guide permalink: python/t-sne-and-umap-projections/ thumbnail: thumbnail/tsne-umap-projections.png --- -This page presents various ways to visualize two popular dimensionality reduction techniques, namely the [t-distributed stochastic neighbor embedding](https://lvdmaaten.github.io/tsne/) (t-SNE) and [Uniform Manifold Approximation and Projection](https://umap-learn.readthedocs.io/en/latest/index.html) (UMAP). They are needed whenever you want to visualize data with more than two or three features (i.e. dimensions). +This page presents various ways to visualize two popular dimensionality reduction techniques, namely the [t-distributed stochastic neighbor embedding](https://lvdmaaten.github.io/tsne/) (t-SNE) and [Uniform Manifold Approximation and Projection](https://umap-learn.readthedocs.io/en/latest/index.html) (UMAP). They are needed whenever you want to visualize data with more than two or three features (i.e. dimensions). -We first show how to visualize data with more than three features using the [scatter plot matrix](https://medium.com/plotly/what-is-a-splom-chart-make-scatterplot-matrices-in-python-8dc4998921c3), then we apply dimensionality reduction techniques to get 2D/3D representation of our data, and visualize the results with [scatter plots](https://plotly.com/python/line-and-scatter/) and [3D scatter plots](https://plotly.com/python/3d-scatter-plots/). 
+We first show how to visualize data with more than three features using the [scatter plot matrix](https://medium.com/plotly/what-is-a-splom-chart-make-scatterplot-matrices-in-python-8dc4998921c3), then we apply dimensionality reduction techniques to get 2D/3D representation of our data, and visualize the results with [scatter plots](https://plotly.com/python/line-and-scatter/) and [3D scatter plots](https://plotly.com/python/3d-scatter-plots/). ## Basic t-SNE projections -t-SNE is a popular dimensionality reduction algorithm that arises from probability theory. Simply put, it projects the high-dimensional data points (sometimes with hundreds of features) into 2D/3D by inducing the projected data to have a similar distribution as the original data points by minimizing something called the [KL divergence](https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-understanding-kl-divergence-2b382ca2b2a8). +t-SNE is a popular dimensionality reduction algorithm that arises from probability theory. Simply put, it projects the high-dimensional data points (sometimes with hundreds of features) into 2D/3D by inducing the projected data to have a similar distribution as the original data points by minimizing something called the [KL divergence](https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-understanding-kl-divergence-2b382ca2b2a8). -Compared to a method like Principal Component Analysis (PCA), it takes signficantly more time to converge, but present signficiantly better insights when visualized. For example, by projecting features of a flowers, it will be able to distinctly group +Compared to a method like Principal Component Analysis (PCA), it takes significantly more time to converge, but presents significantly better insights when visualized. 
For example, by projecting features of a flowers, it will be able to distinctly group ### Visualizing high-dimensional data with `px.scatter_matrix` @@ -75,7 +74,7 @@ tsne = TSNE(n_components=2, random_state=0) projections = tsne.fit_transform(features) fig = px.scatter( - projections, x=0, y=1, + projections, x=0, y=1, color=df.species, labels={'color': 'species'} ) fig.show() @@ -97,7 +96,7 @@ tsne = TSNE(n_components=3, random_state=0) projections = tsne.fit_transform(features, ) fig = px.scatter_3d( - projections, x=0, y=1, z=2, + projections, x=0, y=1, z=2, color=df.species, labels={'color': 'species'} ) fig.update_traces(marker_size=8) @@ -129,7 +128,7 @@ fig_2d = px.scatter( color=df.species, labels={'color': 'species'} ) fig_3d = px.scatter_3d( - proj_3d, x=0, y=1, z=2, + proj_3d, x=0, y=1, z=2, color=df.species, labels={'color': 'species'} ) fig_3d.update_traces(marker_size=5) @@ -157,7 +156,7 @@ umap_2d.fit(digits.data) projections = umap_2d.transform(digits.data) fig = px.scatter( - projections, x=0, y=1, + projections, x=0, y=1, color=digits.target.astype(str), labels={'color': 'digit'} ) fig.show() From f3507e4326c4bf0feb2a78c037e8a1f7fbe8add8 Mon Sep 17 00:00:00 2001 From: Nicolas Kruchten Date: Tue, 18 Aug 2020 13:55:30 -0400 Subject: [PATCH 34/35] longer timeout for umap --- doc/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/Makefile b/doc/Makefile index 5e9861159a1..996f18ebdaa 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -39,7 +39,8 @@ $(HTML_DIR)/2019-07-03-%.html: $(IPYNB_DIR)/%.ipynb @echo "[nbconvert] $<" @jupyter nbconvert $< --to html --template nb.tpl \ --output-dir $(HTML_DIR) --output 2019-07-03-$*.html \ - --execute > $(FAIL_DIR)/$* 2>&1 && rm -f $(FAIL_DIR)/$* + --execute > $(FAIL_DIR)/$* 2>&1 && rm -f $(FAIL_DIR)/$* \ + --ExecutePreprocessor.timeout=600 $(REDIR_DIR)/2019-07-03-redirect-next-%.html: $(IPYNB_DIR)/%.ipynb From 53de99c7632dd42e8384ab5d9458a837e92abdcd Mon Sep 17 00:00:00 2001 From: 
Nicolas Kruchten Date: Tue, 18 Aug 2020 14:29:27 -0400 Subject: [PATCH 35/35] longer timeout for umap --- doc/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index 996f18ebdaa..4390fed7e0b 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -38,9 +38,9 @@ $(HTML_DIR)/2019-07-03-%.html: $(IPYNB_DIR)/%.ipynb @mkdir -p $(FAIL_DIR) @echo "[nbconvert] $<" @jupyter nbconvert $< --to html --template nb.tpl \ + --ExecutePreprocessor.timeout=600\ --output-dir $(HTML_DIR) --output 2019-07-03-$*.html \ - --execute > $(FAIL_DIR)/$* 2>&1 && rm -f $(FAIL_DIR)/$* \ - --ExecutePreprocessor.timeout=600 + --execute > $(FAIL_DIR)/$* 2>&1 && rm -f $(FAIL_DIR)/$* $(REDIR_DIR)/2019-07-03-redirect-next-%.html: $(IPYNB_DIR)/%.ipynb