From 692d896a4342156c4ed0f49655dcd7bbce96930f Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 19 Apr 2020 19:41:46 -0400 Subject: [PATCH 01/11] DOC Adjusts toc on left --- doc/conf.py | 3 +- doc/contents.rst | 5 +- doc/data_transforms.rst | 3 + doc/datasets/index.rst | 3 + doc/developers/index.rst | 5 +- doc/inspection.rst | 3 + doc/model_selection.rst | 3 + doc/modules/computing.rst | 3 + doc/preface.rst | 8 +- doc/sphinxext/add_toctree_functions.py | 119 ++++++++++++++++++ doc/supervised_learning.rst | 3 + doc/themes/scikit-learn-modern/layout.html | 39 ++++-- .../scikit-learn-modern/static/css/theme.css | 4 + doc/tutorial/index.rst | 9 -- doc/unsupervised_learning.rst | 3 + doc/user_guide.rst | 5 +- doc/visualizations.rst | 3 + 17 files changed, 189 insertions(+), 32 deletions(-) create mode 100644 doc/sphinxext/add_toctree_functions.py diff --git a/doc/conf.py b/doc/conf.py index c3ab17d3e73af..4ffe88e09e34c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -38,7 +38,8 @@ 'sphinx.ext.intersphinx', 'sphinx.ext.imgconverter', 'sphinx_gallery.gen_gallery', - 'sphinx_issues' + 'sphinx_issues', + 'add_toctree_functions', ] # this is needed for some reason... diff --git a/doc/contents.rst b/doc/contents.rst index a28634621d558..75c16f3e0b13d 100644 --- a/doc/contents.rst +++ b/doc/contents.rst @@ -1,9 +1,8 @@ .. include:: includes/big_toc_css.rst .. include:: tune_toc.rst -.. Places global toc into the sidebar - -:globalsidebartoc: True +.. Places parent toc into the sidebar +:parenttoc: True ================= Table Of Contents diff --git a/doc/data_transforms.rst b/doc/data_transforms.rst index 01547f68008b6..50c93ac935276 100644 --- a/doc/data_transforms.rst +++ b/doc/data_transforms.rst @@ -1,3 +1,6 @@ +.. Places parent toc into the sidebar +:parenttoc: True + .. include:: includes/big_toc_css.rst .. _data-transforms: diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 88ae88d7a3151..d68c3ea348338 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -1,3 +1,6 @@ +.. Places parent toc into the sidebar +:parenttoc: True + .. _datasets: ========================= diff --git a/doc/developers/index.rst b/doc/developers/index.rst index e64adf5ac73a9..92c102a1da3da 100644 --- a/doc/developers/index.rst +++ b/doc/developers/index.rst @@ -1,6 +1,5 @@ -.. Places global toc into the sidebar - -:globalsidebartoc: True +.. Places parent toc into the sidebar +:parenttoc: True .. _developers_guide: diff --git a/doc/inspection.rst b/doc/inspection.rst index 1304a1030abb9..3c63ca67782fc 100644 --- a/doc/inspection.rst +++ b/doc/inspection.rst @@ -1,3 +1,6 @@ +.. Places parent toc into the sidebar +:parenttoc: True + .. include:: includes/big_toc_css.rst .. _inspection: diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 7b540072c15e5..97b2e40be8cb6 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -1,3 +1,6 @@ +.. Places parent toc into the sidebar +:parenttoc: True + .. include:: includes/big_toc_css.rst .. _model_selection: diff --git a/doc/modules/computing.rst b/doc/modules/computing.rst index 246085d436cde..209d9e544a1a1 100644 --- a/doc/modules/computing.rst +++ b/doc/modules/computing.rst @@ -1,3 +1,6 @@ +.. Places parent toc into the sidebar +:parenttoc: True + ============================ Computing with scikit-learn ============================ diff --git a/doc/preface.rst b/doc/preface.rst index 447083a3a8136..9773a53e0f2b4 100644 --- a/doc/preface.rst +++ b/doc/preface.rst @@ -1,9 +1,8 @@ .. 
This helps define the TOC ordering for "about us" sections. Particularly
   useful for PDF output as this section is not linked from elsewhere.

-.. Places global toc into the sidebar
-
-:globalsidebartoc: True
+.. Places parent toc into the sidebar
+:parenttoc: True

 .. _preface_menu:

@@ -14,8 +13,6 @@
 Welcome to scikit-learn
 =======================

-|
-
 .. toctree::
    :maxdepth: 2

@@ -29,4 +26,3 @@ Welcome to scikit-learn
    roadmap
    governance

-|
diff --git a/doc/sphinxext/add_toctree_functions.py b/doc/sphinxext/add_toctree_functions.py
new file mode 100644
index 0000000000000..c8c4611d80e13
--- /dev/null
+++ b/doc/sphinxext/add_toctree_functions.py
@@ -0,0 +1,119 @@
+import docutils
+
+
+def add_toctree_functions(app, pagename, templatename, context, doctree):
+    """Add functions so Jinja templates can add toctree objects.
+
+    This converts the docutils nodes into a nested dictionary that Jinja can
+    use in our templating.
+    """
+    from sphinx.environment.adapters.toctree import TocTree
+
+    def get_nav_object(maxdepth=None, collapse=True, **kwargs):
+        """Return a list of nav links that can be accessed from Jinja.
+
+        Parameters
+        ----------
+        maxdepth : int
+            How many layers of TocTree will be returned
+        collapse : bool
+            Whether to only include sub-pages of the currently-active page,
+            instead of sub-pages of all top-level pages of the site.
+        kwargs : key/val pairs
+            Passed to the `TocTree.get_toctree_for` Sphinx method
+        """
+        # The TocTree will contain the full site TocTree including sub-pages.
+        # "collapse=True" collapses sub-pages of non-active TOC pages.
+        # maxdepth controls how many TOC levels are returned
+        toctree = TocTree(app.env).get_toctree_for(
+            pagename, app.builder, collapse=collapse, maxdepth=maxdepth, **kwargs
+        )
+        # If no toctree is defined (AKA a single-page site), skip this
+        if toctree is None:
+            return []
+
+        # toctree has this structure
+        #
+        # <caption>
+        # <bullet_list>
+        #     <list_item classes="toctree-l1">
+        #
+        # `list_item`s are the actual TOC links and are the only thing we want
+        toc_items = [item for child in toctree.children for item in child
+                     if isinstance(item, docutils.nodes.list_item)]
+
+        # Now convert our docutils nodes into dicts that Jinja can use
+        nav = [docutils_node_to_jinja(child, only_pages=True)
+               for child in toc_items]
+
+        return nav
+
+    def get_page_toc_object():
+        """Return a list of within-page TOC links that can be accessed from Jinja."""
+        self_toc = TocTree(app.env).get_toc_for(pagename, app.builder)
+
+        try:
+            nav = docutils_node_to_jinja(self_toc.children[0])
+            return nav
+        except:
+            return {}
+
+    context["get_nav_object"] = get_nav_object
+    context["get_page_toc_object"] = get_page_toc_object
+
+
+def docutils_node_to_jinja(list_item, only_pages=False):
+    """Convert a docutils node to a structure that can be read by Jinja.
+
+    Parameters
+    ----------
+    list_item : docutils list_item node
+        A parent item, potentially with children, corresponding to the level
+        of a TocTree.
+    only_pages : bool
+        Only include items for full pages in the output dictionary. Exclude
+        anchor links (TOC items with a URL that starts with #)
+
+    Returns
+    -------
+    nav : dict
+        The TocTree, converted into a dictionary with key/values that work
+        within Jinja.
+    """
+    if not list_item.children:
+        return None
+
+    # We assume this structure of a list item:
+    #
+    # <list_item>
+    #     <compact_paragraph>
+    #         <reference> <-- the thing we want
+    reference = list_item.children[0].children[0]
+    title = reference.astext()
+    url = reference.attributes["refuri"]
+    active = "current" in list_item.attributes["classes"]
+
+    # If we've got an anchor link, skip it if we wish
+    if only_pages and '#' in url:
+        return None
+
+    # Converting the docutils attributes into jinja-friendly objects
+    nav = {}
+    nav["title"] = title
+    nav["url"] = url
+    nav["active"] = active
+
+    # Recursively convert children as well
+    # If there are sub-pages for this list_item, there should be two children:
+    # a paragraph, and a bullet_list.
+    nav["children"] = []
+    if len(list_item.children) > 1:
+        # The `.children` of the bullet_list has the nodes of the sub-pages.
+        subpage_list = list_item.children[1].children
+        for sub_page in subpage_list:
+            child_nav = docutils_node_to_jinja(sub_page, only_pages=only_pages)
+            if child_nav is not None:
+                nav["children"].append(child_nav)
+    return nav
+
+
+def setup(app):
+    app.connect("html-page-context", add_toctree_functions)
diff --git a/doc/supervised_learning.rst b/doc/supervised_learning.rst
index b89e9e033e96b..0236320028a98 100644
--- a/doc/supervised_learning.rst
+++ b/doc/supervised_learning.rst
@@ -1,3 +1,6 @@
+.. Places parent toc into the sidebar
+:parenttoc: True
+
 .. include:: includes/big_toc_css.rst

 .. _supervised-learning:
diff --git a/doc/themes/scikit-learn-modern/layout.html b/doc/themes/scikit-learn-modern/layout.html
index 6f29cf52f7c91..5519b98ab65fc 100644
--- a/doc/themes/scikit-learn-modern/layout.html
+++ b/doc/themes/scikit-learn-modern/layout.html
@@ -87,15 +87,40 @@
           Please cite us if you use the software.

-          {%- if meta and meta['globalsidebartoc']|tobool %}
-          <div class="sk-sidebar-toc">
-            {{ toctree(maxdepth=2, titles_only=True) }}
-          </div>
-          {%- else %}
-          <div class="sk-sidebar-toc">
-            {{ toc }}
-          </div>
-          {%- endif %}
+          {%- if meta and meta['parenttoc']|tobool %}
+          {% set nav = get_nav_object(maxdepth=5, collapse=True) %}
+          <div class="sk-sidebar-toc">
+            <ul>
+              {% for main_nav_item in nav %}
+              {% if main_nav_item.active %}
+              <li>
+                <a href="{{ main_nav_item.url }}" class="sk-toc-active">{{ main_nav_item.title }}</a>
+              </li>
+              <ul>
+                {% for nav_item in main_nav_item.children %}
+                <li>
+                  <a href="{{ nav_item.url }}">{{ nav_item.title }}</a>
+                  {% if nav_item.children %}
+                  <ul>
+                    {% for inner_nav_item in nav_item.children %}
+                    <li><a href="{{ inner_nav_item.url }}">{{ inner_nav_item.title }}</a></li>
+                    {% endfor %}
+                  </ul>
+                  {% endif %}
+                </li>
+                {% endfor %}
+              </ul>
+              {% endif %}
+              {% endfor %}
+            </ul>
+          </div>
+          {%- else %}
+          <div class="sk-sidebar-toc">
+            {{ toc }}
+          </div>
+          {%- endif %}
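For reference, the template above consumes the nested structure produced by
``get_nav_object`` in ``add_toctree_functions.py``. A minimal sketch of one
entry (the titles and URLs are hypothetical placeholders)::

    # Illustrative shape of the list returned by get_nav_object().
    nav = [
        {
            "title": "User Guide",       # text of the docutils <reference>
            "url": "user_guide.html",    # the reference's refuri
            "active": True,              # True when "current" is in the node classes
            "children": [                # nested entries of the same shape
                {"title": "Supervised learning",
                 "url": "supervised_learning.html",
                 "active": False,
                 "children": []},
            ],
        },
    ]

The Jinja loop renders one list item per entry and recurses into
``children`` only for the active entry.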
diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 2b80d6fe2b762..f1e490a12b3d9 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -511,6 +511,10 @@ div.sk-sidebar-toc-logo { height: 52px; } +.sk-toc-active { + font-weight: bold; +} + div.sk-sidebar-toc-wrapper { font-size: 0.9rem; width: 252px; diff --git a/doc/tutorial/index.rst b/doc/tutorial/index.rst index cfd63719321f2..b9bf27b6921bb 100644 --- a/doc/tutorial/index.rst +++ b/doc/tutorial/index.rst @@ -1,10 +1,5 @@ -.. Places global toc into the sidebar - -:globalsidebartoc: True - .. _tutorial_menu: - .. include:: ../includes/big_toc_css.rst .. include:: ../tune_toc.rst @@ -12,8 +7,6 @@ scikit-learn Tutorials ====================== -| - .. toctree:: :maxdepth: 2 @@ -23,8 +16,6 @@ scikit-learn Tutorials machine_learning_map/index ../presentations -| - .. note:: **Doctest Mode** The code-examples in the above tutorials are written in a diff --git a/doc/unsupervised_learning.rst b/doc/unsupervised_learning.rst index e09e13ef1a942..5e5998c9ddf48 100644 --- a/doc/unsupervised_learning.rst +++ b/doc/unsupervised_learning.rst @@ -1,3 +1,6 @@ +.. Places parent toc into the sidebar +:parenttoc: True + .. include:: includes/big_toc_css.rst .. _unsupervised-learning: diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 48679aa961782..6e9f06fa18446 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -1,6 +1,5 @@ -.. Places global toc into the sidebar - -:globalsidebartoc: True +.. Places parent toc into the sidebar +:parenttoc: True .. title:: User guide: contents diff --git a/doc/visualizations.rst b/doc/visualizations.rst index 47d826602b62f..3c3b8e985dd0f 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -1,3 +1,6 @@ +.. Places parent toc into the sidebar +:parenttoc: True + .. include:: includes/big_toc_css.rst .. _visualizations: From 1c620db4e5a060c0d7a8360b0a5225cfd5a71171 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 19 Apr 2020 19:52:45 -0400 Subject: [PATCH 02/11] DOC Adds support to nav --- doc/themes/scikit-learn-modern/nav.html | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/themes/scikit-learn-modern/nav.html b/doc/themes/scikit-learn-modern/nav.html index 57c631f6cbee7..b85e1df1bd66a 100644 --- a/doc/themes/scikit-learn-modern/nav.html +++ b/doc/themes/scikit-learn-modern/nav.html @@ -12,6 +12,7 @@ ('Glossary', pathto('glossary')), ('Development', pathto('developers/index')), ('FAQ', pathto('faq')), + ('Support', pathto('support')), ('Related packages', pathto('related_projects')), ('Roadmap', pathto('roadmap')), ('About us', pathto('about')), From 230fcba61d7c5b4536672ea20f763f939aff2212 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Sun, 19 Apr 2020 19:56:32 -0400 Subject: [PATCH 03/11] DOC Adds license --- doc/sphinxext/add_toctree_functions.py | 48 +++++++++++++++++++------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/doc/sphinxext/add_toctree_functions.py b/doc/sphinxext/add_toctree_functions.py index c8c4611d80e13..efdce234e32a7 100644 --- a/doc/sphinxext/add_toctree_functions.py +++ b/doc/sphinxext/add_toctree_functions.py @@ -1,3 +1,36 @@ +"""Inspired by https://github.com/pandas-dev/pydata-sphinx-theme + +BSD 3-Clause License + +Copyright (c) 2018, pandas +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + import docutils @@ -26,8 +59,8 @@ def get_nav_object(maxdepth=None, collapse=True, **kwargs): # "collapse=True" collapses sub-pages of non-active TOC pages. # maxdepth controls how many TOC levels are returned toctree = TocTree(app.env).get_toctree_for( - pagename, app.builder, collapse=collapse, maxdepth=maxdepth, **kwargs - ) + pagename, app.builder, collapse=collapse, maxdepth=maxdepth, + **kwargs) # If no toctree is defined (AKA a single-page site), skip this if toctree is None: return [] @@ -47,18 +80,7 @@ def get_nav_object(maxdepth=None, collapse=True, **kwargs): return nav - def get_page_toc_object(): - """Return a list of within-page TOC links that can be accessed from Jinja.""" - self_toc = TocTree(app.env).get_toc_for(pagename, app.builder) - - try: - nav = docutils_node_to_jinja(self_toc.children[0]) - return nav - except: - return {} - context["get_nav_object"] = get_nav_object - context["get_page_toc_object"] = get_page_toc_object def docutils_node_to_jinja(list_item, only_pages=False): From 46f124cba5869f5268e4feaf4126447e41fad793 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 20 Apr 2020 20:33:25 -0400 Subject: [PATCH 04/11] STY Fixes rst formating --- doc/contents.rst | 1 + doc/data_transforms.rst | 1 + doc/datasets/index.rst | 1 + doc/developers/index.rst | 1 + doc/inspection.rst | 1 + doc/model_selection.rst | 1 + doc/modules/computing.rst | 1 + doc/preface.rst | 1 + doc/supervised_learning.rst | 1 + doc/unsupervised_learning.rst | 1 + doc/user_guide.rst | 1 + doc/visualizations.rst | 1 + 12 files changed, 12 insertions(+) diff --git a/doc/contents.rst b/doc/contents.rst index 75c16f3e0b13d..829fbc7e3dc98 100644 --- a/doc/contents.rst +++ b/doc/contents.rst @@ -2,6 +2,7 @@ .. include:: tune_toc.rst .. Places parent toc into the sidebar + :parenttoc: True ================= diff --git a/doc/data_transforms.rst b/doc/data_transforms.rst index 50c93ac935276..084214cb094f5 100644 --- a/doc/data_transforms.rst +++ b/doc/data_transforms.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. 
include:: includes/big_toc_css.rst diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index d68c3ea348338..ff66fd20ed9d7 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. _datasets: diff --git a/doc/developers/index.rst b/doc/developers/index.rst index 92c102a1da3da..a9e691968a6ff 100644 --- a/doc/developers/index.rst +++ b/doc/developers/index.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. _developers_guide: diff --git a/doc/inspection.rst b/doc/inspection.rst index 3c63ca67782fc..72305bec73a10 100644 --- a/doc/inspection.rst +++ b/doc/inspection.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. include:: includes/big_toc_css.rst diff --git a/doc/model_selection.rst b/doc/model_selection.rst index 97b2e40be8cb6..04e41c454419e 100644 --- a/doc/model_selection.rst +++ b/doc/model_selection.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. include:: includes/big_toc_css.rst diff --git a/doc/modules/computing.rst b/doc/modules/computing.rst index 209d9e544a1a1..f15f4c95fa6ac 100644 --- a/doc/modules/computing.rst +++ b/doc/modules/computing.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True ============================ diff --git a/doc/preface.rst b/doc/preface.rst index 9773a53e0f2b4..08100c028e40c 100644 --- a/doc/preface.rst +++ b/doc/preface.rst @@ -2,6 +2,7 @@ useful for PDF output as this section is not linked from elsewhere. .. Places parent toc into the sidebar + :parenttoc: True .. _preface_menu: diff --git a/doc/supervised_learning.rst b/doc/supervised_learning.rst index 0236320028a98..d6e907f60cf84 100644 --- a/doc/supervised_learning.rst +++ b/doc/supervised_learning.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. include:: includes/big_toc_css.rst diff --git a/doc/unsupervised_learning.rst b/doc/unsupervised_learning.rst index 5e5998c9ddf48..9c1de0c134623 100644 --- a/doc/unsupervised_learning.rst +++ b/doc/unsupervised_learning.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. include:: includes/big_toc_css.rst diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 6e9f06fa18446..cd65983d1ee86 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. title:: User guide: contents diff --git a/doc/visualizations.rst b/doc/visualizations.rst index 3c3b8e985dd0f..b16aafa9a05c8 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -1,4 +1,5 @@ .. Places parent toc into the sidebar + :parenttoc: True .. include:: includes/big_toc_css.rst From eab7f44713c98a93239f8f380a9e0d2b4e24c104 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 19:17:28 -0400 Subject: [PATCH 05/11] ENH Adds global back in --- doc/contents.rst | 4 ++-- doc/preface.rst | 4 ++-- doc/themes/scikit-learn-modern/layout.html | 12 ++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/contents.rst b/doc/contents.rst index 829fbc7e3dc98..a28634621d558 100644 --- a/doc/contents.rst +++ b/doc/contents.rst @@ -1,9 +1,9 @@ .. include:: includes/big_toc_css.rst .. include:: tune_toc.rst -.. Places parent toc into the sidebar +.. 
Places global toc into the sidebar -:parenttoc: True +:globalsidebartoc: True ================= Table Of Contents diff --git a/doc/preface.rst b/doc/preface.rst index 08100c028e40c..ce3bf6c73092f 100644 --- a/doc/preface.rst +++ b/doc/preface.rst @@ -1,9 +1,9 @@ .. This helps define the TOC ordering for "about us" sections. Particularly useful for PDF output as this section is not linked from elsewhere. -.. Places parent toc into the sidebar +.. Places global toc into the sidebar -:parenttoc: True +:globalsidebartoc: True .. _preface_menu: diff --git a/doc/themes/scikit-learn-modern/layout.html b/doc/themes/scikit-learn-modern/layout.html index 5519b98ab65fc..e930bab6b53ce 100644 --- a/doc/themes/scikit-learn-modern/layout.html +++ b/doc/themes/scikit-learn-modern/layout.html @@ -87,10 +87,10 @@ Please cite us if you use the software.

           {%- if meta and meta['parenttoc']|tobool %}
-          {% set nav = get_nav_object(maxdepth=5, collapse=True) %}
           <div class="sk-sidebar-toc">
+            {% set nav = get_nav_object(maxdepth=5, collapse=True) %}
             <ul>
               {% for main_nav_item in nav %}
               {% if main_nav_item.active %}
@@ -114,13 +114,17 @@
               {% endif %}
               {% endfor %}
             </ul>
           </div>
+          {%- elif meta and meta['globalsidebartoc']|tobool %}
+          <div class="sk-sidebar-toc">
+            {{ toctree(maxdepth=2, titles_only=True) }}
+          </div>
           {%- else %}
           <div class="sk-sidebar-toc">
             {{ toc }}
           </div>
           {%- endif %}
From 9755d58a9f27c36dcf3d2d7ebd8b40f909aadb32 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 19:20:15 -0400 Subject: [PATCH 06/11] REV Less diffs --- doc/preface.rst | 3 +++ doc/tutorial/index.rst | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/doc/preface.rst b/doc/preface.rst index ce3bf6c73092f..447083a3a8136 100644 --- a/doc/preface.rst +++ b/doc/preface.rst @@ -14,6 +14,8 @@ Welcome to scikit-learn ======================= +| + .. toctree:: :maxdepth: 2 @@ -27,3 +29,4 @@ Welcome to scikit-learn roadmap governance +| diff --git a/doc/tutorial/index.rst b/doc/tutorial/index.rst index b9bf27b6921bb..cfd63719321f2 100644 --- a/doc/tutorial/index.rst +++ b/doc/tutorial/index.rst @@ -1,5 +1,10 @@ +.. Places global toc into the sidebar + +:globalsidebartoc: True + .. _tutorial_menu: + .. include:: ../includes/big_toc_css.rst .. include:: ../tune_toc.rst @@ -7,6 +12,8 @@ scikit-learn Tutorials ====================== +| + .. toctree:: :maxdepth: 2 @@ -16,6 +23,8 @@ scikit-learn Tutorials machine_learning_map/index ../presentations +| + .. note:: **Doctest Mode** The code-examples in the above tutorials are written in a From 25a9d27801cb83c6444d17c2212731163a3d7319 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 25 Jul 2020 11:51:08 -0400 Subject: [PATCH 07/11] DOC Adds section numbers --- doc/sphinxext/add_toctree_functions.py | 15 +++++++++++---- doc/themes/scikit-learn-modern/layout.html | 2 +- .../scikit-learn-modern/static/css/theme.css | 1 - 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/sphinxext/add_toctree_functions.py b/doc/sphinxext/add_toctree_functions.py index efdce234e32a7..88110e4207c73 100644 --- a/doc/sphinxext/add_toctree_functions.py +++ b/doc/sphinxext/add_toctree_functions.py @@ -42,7 +42,7 @@ def add_toctree_functions(app, pagename, templatename, context, doctree): """ from sphinx.environment.adapters.toctree import TocTree - def get_nav_object(maxdepth=None, collapse=True, **kwargs): + def get_nav_object(maxdepth=None, collapse=True, numbered=False, **kwargs): """Return a list of nav links that can be accessed from Jinja. Parameters @@ -75,7 +75,8 @@ def get_nav_object(maxdepth=None, collapse=True, **kwargs): if isinstance(item, docutils.nodes.list_item)] # Now convert our docutils nodes into dicts that Jinja can use - nav = [docutils_node_to_jinja(child, only_pages=True) + nav = [docutils_node_to_jinja(child, only_pages=True, + numbered=numbered) for child in toc_items] return nav @@ -83,7 +84,7 @@ def get_nav_object(maxdepth=None, collapse=True, **kwargs): context["get_nav_object"] = get_nav_object -def docutils_node_to_jinja(list_item, only_pages=False): +def docutils_node_to_jinja(list_item, only_pages=False, numbered=False): """Convert a docutils node to a structure that can be read by Jinja. Parameters @@ -113,6 +114,11 @@ def docutils_node_to_jinja(list_item, only_pages=False): url = reference.attributes["refuri"] active = "current" in list_item.attributes["classes"] + secnumber = reference.attributes.get("secnumber", None) + if numbered and secnumber is not None: + secnumber = ".".join(str(n) for n in secnumber) + title = f"{secnumber}. {title}" + # If we've got an anchor link, skip it if we wish if only_pages and '#' in url: return None @@ -131,7 +137,8 @@ def docutils_node_to_jinja(list_item, only_pages=False): # The `.children` of the bullet_list has the nodes of the sub-pages. 
subpage_list = list_item.children[1].children for sub_page in subpage_list: - child_nav = docutils_node_to_jinja(sub_page, only_pages=only_pages) + child_nav = docutils_node_to_jinja(sub_page, only_pages=only_pages, + numbered=numbered) if child_nav is not None: nav["children"].append(child_nav) return nav diff --git a/doc/themes/scikit-learn-modern/layout.html b/doc/themes/scikit-learn-modern/layout.html index c2fdaf1073c7a..a4b9733b68709 100644 --- a/doc/themes/scikit-learn-modern/layout.html +++ b/doc/themes/scikit-learn-modern/layout.html @@ -88,7 +88,7 @@
{%- if meta and meta['parenttoc']|tobool %}
- {% set nav = get_nav_object(maxdepth=5, collapse=True) %} + {% set nav = get_nav_object(maxdepth=3, collapse=True, numbered=True) %}
    {% for main_nav_item in nav %} {% if main_nav_item.active %} diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 860a4b2404661..db2acbc3a11bb 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -553,7 +553,6 @@ div.sk-sidebar-toc ul ul { } div.sk-sidebar-toc ul ul ul { - list-style: square; margin-left: 1rem; } From 34e6add724a401e03423ac7403d930943a395b79 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 25 Jul 2020 13:58:32 -0400 Subject: [PATCH 08/11] DOC Address comments --- doc/computing.rst | 14 + .../computational_performance.rst} | 347 ------------------ doc/computing/parallelism.rst | 210 +++++++++++ doc/computing/scaling_strategies.rst | 139 +++++++ doc/datasets.rst | 34 ++ doc/datasets/general.rst | 45 +++ .../{index.rst => loading_other_datasets.rst} | 259 +------------ doc/datasets/real_world.rst | 44 +++ doc/datasets/sample_generators.rst | 121 ++++++ doc/datasets/toy_dataset.rst | 46 +++ doc/user_guide.rst | 4 +- 11 files changed, 657 insertions(+), 606 deletions(-) create mode 100644 doc/computing.rst rename doc/{modules/computing.rst => computing/computational_performance.rst} (54%) create mode 100644 doc/computing/parallelism.rst create mode 100644 doc/computing/scaling_strategies.rst create mode 100644 doc/datasets.rst create mode 100644 doc/datasets/general.rst rename doc/datasets/{index.rst => loading_other_datasets.rst} (55%) create mode 100644 doc/datasets/real_world.rst create mode 100644 doc/datasets/sample_generators.rst create mode 100644 doc/datasets/toy_dataset.rst diff --git a/doc/computing.rst b/doc/computing.rst new file mode 100644 index 0000000000000..6e15da5c11837 --- /dev/null +++ b/doc/computing.rst @@ -0,0 +1,14 @@ +.. Places parent toc into the sidebar + +:parenttoc: True + +============================ +Computing with scikit-learn +============================ + +.. toctree:: + :maxdepth: 2 + + computing/scaling_strategies + computing/computational_performance + computing/parallelism diff --git a/doc/modules/computing.rst b/doc/computing/computational_performance.rst similarity index 54% rename from doc/modules/computing.rst rename to doc/computing/computational_performance.rst index f15f4c95fa6ac..48fddf1c43f2d 100644 --- a/doc/modules/computing.rst +++ b/doc/computing/computational_performance.rst @@ -2,146 +2,6 @@ :parenttoc: True -============================ -Computing with scikit-learn -============================ - -.. _scaling_strategies: - -Strategies to scale computationally: bigger data -================================================= - -For some applications the amount of examples, features (or both) and/or the -speed at which they need to be processed are challenging for traditional -approaches. In these cases scikit-learn has a number of options you can -consider to make your system scale. - -Scaling with instances using out-of-core learning --------------------------------------------------- - -Out-of-core (or "external memory") learning is a technique used to learn from -data that cannot fit in a computer's main memory (RAM). - -Here is a sketch of a system designed to achieve this goal: - - 1. a way to stream instances - 2. a way to extract features from instances - 3. an incremental algorithm - -Streaming instances -.................... - -Basically, 1. may be a reader that yields instances from files on a -hard drive, a database, from a network stream etc. 
However, -details on how to achieve this are beyond the scope of this documentation. - -Extracting features -................... - -\2. could be any relevant way to extract features among the -different :ref:`feature extraction ` methods supported by -scikit-learn. However, when working with data that needs vectorization and -where the set of features or values is not known in advance one should take -explicit care. A good example is text classification where unknown terms are -likely to be found during training. It is possible to use a stateful -vectorizer if making multiple passes over the data is reasonable from an -application point of view. Otherwise, one can turn up the difficulty by using -a stateless feature extractor. Currently the preferred way to do this is to -use the so-called :ref:`hashing trick` as implemented by -:class:`sklearn.feature_extraction.FeatureHasher` for datasets with categorical -variables represented as list of Python dicts or -:class:`sklearn.feature_extraction.text.HashingVectorizer` for text documents. - -Incremental learning -..................... - -Finally, for 3. we have a number of options inside scikit-learn. Although not -all algorithms can learn incrementally (i.e. without seeing all the instances -at once), all estimators implementing the ``partial_fit`` API are candidates. -Actually, the ability to learn incrementally from a mini-batch of instances -(sometimes called "online learning") is key to out-of-core learning as it -guarantees that at any given time there will be only a small amount of -instances in the main memory. Choosing a good size for the mini-batch that -balances relevancy and memory footprint could involve some tuning [1]_. - -Here is a list of incremental estimators for different tasks: - - - Classification - + :class:`sklearn.naive_bayes.MultinomialNB` - + :class:`sklearn.naive_bayes.BernoulliNB` - + :class:`sklearn.linear_model.Perceptron` - + :class:`sklearn.linear_model.SGDClassifier` - + :class:`sklearn.linear_model.PassiveAggressiveClassifier` - + :class:`sklearn.neural_network.MLPClassifier` - - Regression - + :class:`sklearn.linear_model.SGDRegressor` - + :class:`sklearn.linear_model.PassiveAggressiveRegressor` - + :class:`sklearn.neural_network.MLPRegressor` - - Clustering - + :class:`sklearn.cluster.MiniBatchKMeans` - + :class:`sklearn.cluster.Birch` - - Decomposition / feature Extraction - + :class:`sklearn.decomposition.MiniBatchDictionaryLearning` - + :class:`sklearn.decomposition.IncrementalPCA` - + :class:`sklearn.decomposition.LatentDirichletAllocation` - - Preprocessing - + :class:`sklearn.preprocessing.StandardScaler` - + :class:`sklearn.preprocessing.MinMaxScaler` - + :class:`sklearn.preprocessing.MaxAbsScaler` - -For classification, a somewhat important thing to note is that although a -stateless feature extraction routine may be able to cope with new/unseen -attributes, the incremental learner itself may be unable to cope with -new/unseen targets classes. In this case you have to pass all the possible -classes to the first ``partial_fit`` call using the ``classes=`` parameter. - -Another aspect to consider when choosing a proper algorithm is that not all of -them put the same importance on each example over time. Namely, the -``Perceptron`` is still sensitive to badly labeled examples even after many -examples whereas the ``SGD*`` and ``PassiveAggressive*`` families are more -robust to this kind of artifacts. 
Conversely, the latter also tend to give less -importance to remarkably different, yet properly labeled examples when they -come late in the stream as their learning rate decreases over time. - -Examples -.......... - -Finally, we have a full-fledged example of -:ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`. It is aimed at -providing a starting point for people wanting to build out-of-core learning -systems and demonstrates most of the notions discussed above. - -Furthermore, it also shows the evolution of the performance of different -algorithms with the number of processed examples. - -.. |accuracy_over_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_001.png - :target: ../auto_examples/applications/plot_out_of_core_classification.html - :scale: 80 - -.. centered:: |accuracy_over_time| - -Now looking at the computation time of the different parts, we see that the -vectorization is much more expensive than learning itself. From the different -algorithms, ``MultinomialNB`` is the most expensive, but its overhead can be -mitigated by increasing the size of the mini-batches (exercise: change -``minibatch_size`` to 100 and 10000 in the program and compare). - -.. |computation_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_003.png - :target: ../auto_examples/applications/plot_out_of_core_classification.html - :scale: 80 - -.. centered:: |computation_time| - - -Notes -...... - -.. [1] Depending on the algorithm the mini-batch size can influence results or - not. SGD*, PassiveAggressive*, and discrete NaiveBayes are truly online - and are not affected by batch size. Conversely, MiniBatchKMeans - convergence rate is affected by the batch size. Also, its memory - footprint can vary dramatically with batch size. - .. _computational_performance: Computational Performance @@ -506,210 +366,3 @@ Links - :ref:`scikit-learn developer performance documentation ` - `Scipy sparse matrix formats documentation `_ - -Parallelism, resource management, and configuration -=================================================== - -.. _parallelism: - -Parallelism ------------ - -Some scikit-learn estimators and utilities can parallelize costly operations -using multiple CPU cores, thanks to the following components: - -- via the `joblib `_ library. In - this case the number of threads or processes can be controlled with the - ``n_jobs`` parameter. -- via OpenMP, used in C or Cython code. - -In addition, some of the numpy routines that are used internally by -scikit-learn may also be parallelized if numpy is installed with specific -numerical libraries such as MKL, OpenBLAS, or BLIS. - -We describe these 3 scenarios in the following subsections. - -Joblib-based parallelism -........................ - -When the underlying implementation uses joblib, the number of workers -(threads or processes) that are spawned in parallel can be controlled via the -``n_jobs`` parameter. - -.. note:: - - Where (and how) parallelization happens in the estimators is currently - poorly documented. Please help us by improving our docs and tackle `issue - 14228 `_! - -Joblib is able to support both multi-processing and multi-threading. Whether -joblib chooses to spawn a thread or a process depends on the **backend** -that it's using. - -Scikit-learn generally relies on the ``loky`` backend, which is joblib's -default backend. Loky is a multi-processing backend. 
When doing -multi-processing, in order to avoid duplicating the memory in each process -(which isn't reasonable with big datasets), joblib will create a `memmap -`_ -that all processes can share, when the data is bigger than 1MB. - -In some specific cases (when the code that is run in parallel releases the -GIL), scikit-learn will indicate to ``joblib`` that a multi-threading -backend is preferable. - -As a user, you may control the backend that joblib will use (regardless of -what scikit-learn recommends) by using a context manager:: - - from joblib import parallel_backend - - with parallel_backend('threading', n_jobs=2): - # Your scikit-learn code here - -Please refer to the `joblib's docs -`_ -for more details. - -In practice, whether parallelism is helpful at improving runtime depends on -many factors. It is usually a good idea to experiment rather than assuming -that increasing the number of workers is always a good thing. In some cases -it can be highly detrimental to performance to run multiple copies of some -estimators or functions in parallel (see oversubscription below). - -OpenMP-based parallelism -........................ - -OpenMP is used to parallelize code written in Cython or C, relying on -multi-threading exclusively. By default (and unless joblib is trying to -avoid oversubscription), the implementation will use as many threads as -possible. - -You can control the exact number of threads that are used via the -``OMP_NUM_THREADS`` environment variable:: - - OMP_NUM_THREADS=4 python my_script.py - -Parallel Numpy routines from numerical libraries -................................................ - -Scikit-learn relies heavily on NumPy and SciPy, which internally call -multi-threaded linear algebra routines implemented in libraries such as MKL, -OpenBLAS or BLIS. - -The number of threads used by the OpenBLAS, MKL or BLIS libraries can be set -via the ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, and -``BLIS_NUM_THREADS`` environment variables. - -Please note that scikit-learn has no direct control over these -implementations. Scikit-learn solely relies on Numpy and Scipy. - -.. note:: - At the time of writing (2019), NumPy and SciPy packages distributed on - pypi.org (used by ``pip``) and on the conda-forge channel are linked - with OpenBLAS, while conda packages shipped on the "defaults" channel - from anaconda.org are linked by default with MKL. - - -Oversubscription: spawning too many threads -........................................... - -It is generally recommended to avoid using significantly more processes or -threads than the number of CPUs on a machine. Over-subscription happens when -a program is running too many threads at the same time. - -Suppose you have a machine with 8 CPUs. Consider a case where you're running -a :class:`~GridSearchCV` (parallelized with joblib) with ``n_jobs=8`` over -a :class:`~HistGradientBoostingClassifier` (parallelized with OpenMP). Each -instance of :class:`~HistGradientBoostingClassifier` will spawn 8 threads -(since you have 8 CPUs). That's a total of ``8 * 8 = 64`` threads, which -leads to oversubscription of physical CPU resources and to scheduling -overhead. - -Oversubscription can arise in the exact same fashion with parallelized -routines from MKL, OpenBLAS or BLIS that are nested in joblib calls. - -Starting from ``joblib >= 0.14``, when the ``loky`` backend is used (which -is the default), joblib will tell its child **processes** to limit the -number of threads they can use, so as to avoid oversubscription. 
In practice -the heuristic that joblib uses is to tell the processes to use ``max_threads -= n_cpus // n_jobs``, via their corresponding environment variable. Back to -our example from above, since the joblib backend of :class:`~GridSearchCV` -is ``loky``, each process will only be able to use 1 thread instead of 8, -thus mitigating the oversubscription issue. - -Note that: - -- Manually setting one of the environment variables (``OMP_NUM_THREADS``, - ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, or ``BLIS_NUM_THREADS``) - will take precedence over what joblib tries to do. The total number of - threads will be ``n_jobs * _NUM_THREADS``. Note that setting this - limit will also impact your computations in the main process, which will - only use ``_NUM_THREADS``. Joblib exposes a context manager for - finer control over the number of threads in its workers (see joblib docs - linked below). -- Joblib is currently unable to avoid oversubscription in a - multi-threading context. It can only do so with the ``loky`` backend - (which spawns processes). - -You will find additional details about joblib mitigation of oversubscription -in `joblib documentation -`_. - - -Configuration switches ------------------------ - -Python runtime -.............. - -:func:`sklearn.set_config` controls the following behaviors: - -:assume_finite: - - used to skip validation, which enables faster computations but may - lead to segmentation faults if the data contains NaNs. - -:working_memory: - - the optimal size of temporary arrays used by some algorithms. - -.. _environment_variable: - -Environment variables -...................... - -These environment variables should be set before importing scikit-learn. - -:SKLEARN_SITE_JOBLIB: - - When this environment variable is set to a non zero value, - scikit-learn uses the site joblib rather than its vendored version. - Consequently, joblib must be installed for scikit-learn to run. - Note that using the site joblib is at your own risks: the versions of - scikit-learn and joblib need to be compatible. Currently, joblib 0.11+ - is supported. In addition, dumps from joblib.Memory might be incompatible, - and you might loose some caches and have to redownload some datasets. - - .. deprecated:: 0.21 - - As of version 0.21 this parameter has no effect, vendored joblib was - removed and site joblib is always used. - -:SKLEARN_ASSUME_FINITE: - - Sets the default value for the `assume_finite` argument of - :func:`sklearn.set_config`. - -:SKLEARN_WORKING_MEMORY: - - Sets the default value for the `working_memory` argument of - :func:`sklearn.set_config`. - -:SKLEARN_SEED: - - Sets the seed of the global random generator when running the tests, - for reproducibility. - -:SKLEARN_SKIP_NETWORK_TESTS: - - When this environment variable is set to a non zero value, the tests - that need network access are skipped. diff --git a/doc/computing/parallelism.rst b/doc/computing/parallelism.rst new file mode 100644 index 0000000000000..480e200560cb8 --- /dev/null +++ b/doc/computing/parallelism.rst @@ -0,0 +1,210 @@ +.. Places parent toc into the sidebar + +:parenttoc: True + +Parallelism, resource management, and configuration +=================================================== + +.. _parallelism: + +Parallelism +----------- + +Some scikit-learn estimators and utilities can parallelize costly operations +using multiple CPU cores, thanks to the following components: + +- via the `joblib `_ library. 
In + this case the number of threads or processes can be controlled with the + ``n_jobs`` parameter. +- via OpenMP, used in C or Cython code. + +In addition, some of the numpy routines that are used internally by +scikit-learn may also be parallelized if numpy is installed with specific +numerical libraries such as MKL, OpenBLAS, or BLIS. + +We describe these 3 scenarios in the following subsections. + +Joblib-based parallelism +........................ + +When the underlying implementation uses joblib, the number of workers +(threads or processes) that are spawned in parallel can be controlled via the +``n_jobs`` parameter. + +.. note:: + + Where (and how) parallelization happens in the estimators is currently + poorly documented. Please help us by improving our docs and tackle `issue + 14228 `_! + +Joblib is able to support both multi-processing and multi-threading. Whether +joblib chooses to spawn a thread or a process depends on the **backend** +that it's using. + +Scikit-learn generally relies on the ``loky`` backend, which is joblib's +default backend. Loky is a multi-processing backend. When doing +multi-processing, in order to avoid duplicating the memory in each process +(which isn't reasonable with big datasets), joblib will create a `memmap +`_ +that all processes can share, when the data is bigger than 1MB. + +In some specific cases (when the code that is run in parallel releases the +GIL), scikit-learn will indicate to ``joblib`` that a multi-threading +backend is preferable. + +As a user, you may control the backend that joblib will use (regardless of +what scikit-learn recommends) by using a context manager:: + + from joblib import parallel_backend + + with parallel_backend('threading', n_jobs=2): + # Your scikit-learn code here + +Please refer to the `joblib's docs +`_ +for more details. + +In practice, whether parallelism is helpful at improving runtime depends on +many factors. It is usually a good idea to experiment rather than assuming +that increasing the number of workers is always a good thing. In some cases +it can be highly detrimental to performance to run multiple copies of some +estimators or functions in parallel (see oversubscription below). + +OpenMP-based parallelism +........................ + +OpenMP is used to parallelize code written in Cython or C, relying on +multi-threading exclusively. By default (and unless joblib is trying to +avoid oversubscription), the implementation will use as many threads as +possible. + +You can control the exact number of threads that are used via the +``OMP_NUM_THREADS`` environment variable:: + + OMP_NUM_THREADS=4 python my_script.py + +Parallel Numpy routines from numerical libraries +................................................ + +Scikit-learn relies heavily on NumPy and SciPy, which internally call +multi-threaded linear algebra routines implemented in libraries such as MKL, +OpenBLAS or BLIS. + +The number of threads used by the OpenBLAS, MKL or BLIS libraries can be set +via the ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, and +``BLIS_NUM_THREADS`` environment variables. + +Please note that scikit-learn has no direct control over these +implementations. Scikit-learn solely relies on Numpy and Scipy. + +.. note:: + At the time of writing (2019), NumPy and SciPy packages distributed on + pypi.org (used by ``pip``) and on the conda-forge channel are linked + with OpenBLAS, while conda packages shipped on the "defaults" channel + from anaconda.org are linked by default with MKL. 
+
+
+Oversubscription: spawning too many threads
+...........................................
+
+It is generally recommended to avoid using significantly more processes or
+threads than the number of CPUs on a machine. Over-subscription happens when
+a program is running too many threads at the same time.
+
+Suppose you have a machine with 8 CPUs. Consider a case where you're running
+a :class:`~GridSearchCV` (parallelized with joblib) with ``n_jobs=8`` over
+a :class:`~HistGradientBoostingClassifier` (parallelized with OpenMP). Each
+instance of :class:`~HistGradientBoostingClassifier` will spawn 8 threads
+(since you have 8 CPUs). That's a total of ``8 * 8 = 64`` threads, which
+leads to oversubscription of physical CPU resources and to scheduling
+overhead.
+
+Oversubscription can arise in the exact same fashion with parallelized
+routines from MKL, OpenBLAS or BLIS that are nested in joblib calls.
+
+Starting from ``joblib >= 0.14``, when the ``loky`` backend is used (which
+is the default), joblib will tell its child **processes** to limit the
+number of threads they can use, so as to avoid oversubscription. In practice
+the heuristic that joblib uses is to tell the processes to use ``max_threads
+= n_cpus // n_jobs``, via their corresponding environment variable. Back to
+our example from above, since the joblib backend of :class:`~GridSearchCV`
+is ``loky``, each process will only be able to use 1 thread instead of 8,
+thus mitigating the oversubscription issue.
+
+Note that:
+
+- Manually setting one of the environment variables (``OMP_NUM_THREADS``,
+  ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, or ``BLIS_NUM_THREADS``)
+  will take precedence over what joblib tries to do. The total number of
+  threads will be ``n_jobs * <LIB>_NUM_THREADS``. Note that setting this
+  limit will also impact your computations in the main process, which will
+  only use ``<LIB>_NUM_THREADS``. Joblib exposes a context manager for
+  finer control over the number of threads in its workers (see joblib docs
+  linked below).
+- Joblib is currently unable to avoid oversubscription in a
+  multi-threading context. It can only do so with the ``loky`` backend
+  (which spawns processes).
+
+You will find additional details about joblib mitigation of oversubscription
+in the `joblib documentation
+<https://joblib.readthedocs.io/en/latest/parallel.html#avoiding-over-subscription-of-cpu-ressources>`_.
+
+
+Configuration switches
+-----------------------
+
+Python runtime
+..............
+
+:func:`sklearn.set_config` controls the following behaviors:
+
+:assume_finite:
+
+    used to skip validation, which enables faster computations but may
+    lead to segmentation faults if the data contains NaNs.
+
+:working_memory:
+
+    the optimal size of temporary arrays used by some algorithms.
+
+.. _environment_variable:
+
+Environment variables
+......................
+
+These environment variables should be set before importing scikit-learn.
+
+:SKLEARN_SITE_JOBLIB:
+
+    When this environment variable is set to a non-zero value,
+    scikit-learn uses the site joblib rather than its vendored version.
+    Consequently, joblib must be installed for scikit-learn to run.
+    Note that using the site joblib is at your own risk: the versions of
+    scikit-learn and joblib need to be compatible. Currently, joblib 0.11+
+    is supported. In addition, dumps from joblib.Memory might be incompatible,
+    and you might lose some caches and have to redownload some datasets.
+
+    .. deprecated:: 0.21
+
+       As of version 0.21 this parameter has no effect, vendored joblib was
+       removed and site joblib is always used.
+
+:SKLEARN_ASSUME_FINITE:
+
+    Sets the default value for the `assume_finite` argument of
+    :func:`sklearn.set_config`.
+
+:SKLEARN_WORKING_MEMORY:
+
+    Sets the default value for the `working_memory` argument of
+    :func:`sklearn.set_config`.
+
+:SKLEARN_SEED:
+
+    Sets the seed of the global random generator when running the tests,
+    for reproducibility.
+
+:SKLEARN_SKIP_NETWORK_TESTS:
+
+    When this environment variable is set to a non-zero value, the tests
+    that need network access are skipped.
diff --git a/doc/computing/scaling_strategies.rst b/doc/computing/scaling_strategies.rst
new file mode 100644
index 0000000000000..5eee5728e4b9a
--- /dev/null
+++ b/doc/computing/scaling_strategies.rst
@@ -0,0 +1,139 @@
+.. Places parent toc into the sidebar
+
+:parenttoc: True
+
+.. _scaling_strategies:
+
+Strategies to scale computationally: bigger data
+=================================================
+
+For some applications the amount of examples, features (or both) and/or the
+speed at which they need to be processed are challenging for traditional
+approaches. In these cases scikit-learn has a number of options you can
+consider to make your system scale.
+
+Scaling with instances using out-of-core learning
+--------------------------------------------------
+
+Out-of-core (or "external memory") learning is a technique used to learn from
+data that cannot fit in a computer's main memory (RAM).
+
+Here is a sketch of a system designed to achieve this goal:
+
+  1. a way to stream instances
+  2. a way to extract features from instances
+  3. an incremental algorithm
+
+Streaming instances
+....................
+
+Basically, 1. may be a reader that yields instances from files on a
+hard drive, a database, from a network stream etc. However,
+details on how to achieve this are beyond the scope of this documentation.
+
+Extracting features
+...................
+
+\2. could be any relevant way to extract features among the
+different :ref:`feature extraction <feature_extraction>` methods supported by
+scikit-learn. However, when working with data that needs vectorization and
+where the set of features or values is not known in advance one should take
+explicit care. A good example is text classification where unknown terms are
+likely to be found during training. It is possible to use a stateful
+vectorizer if making multiple passes over the data is reasonable from an
+application point of view. Otherwise, one can turn up the difficulty by using
+a stateless feature extractor. Currently the preferred way to do this is to
+use the so-called :ref:`hashing trick <feature_hashing>` as implemented by
+:class:`sklearn.feature_extraction.FeatureHasher` for datasets with categorical
+variables represented as a list of Python dicts or
+:class:`sklearn.feature_extraction.text.HashingVectorizer` for text documents.
+
+Incremental learning
+.....................
+
+Finally, for 3. we have a number of options inside scikit-learn. Although not
+all algorithms can learn incrementally (i.e. without seeing all the instances
+at once), all estimators implementing the ``partial_fit`` API are candidates.
+Actually, the ability to learn incrementally from a mini-batch of instances
+(sometimes called "online learning") is key to out-of-core learning as it
+guarantees that at any given time there will be only a small amount of
+instances in the main memory. Choosing a good size for the mini-batch that
+balances relevancy and memory footprint could involve some tuning [1]_.
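+
+As a minimal illustration of this pattern (a sketch only: the random
+mini-batches below stand in for chunks streamed from disk or a database)::
+
+    import numpy as np
+    from sklearn.linear_model import SGDClassifier
+
+    rng = np.random.RandomState(0)
+    clf = SGDClassifier()
+    classes = np.array([0, 1])  # all possible classes, needed on the first call
+
+    for _ in range(10):  # stand-in for reading successive mini-batches
+        X_batch = rng.rand(100, 20)
+        y_batch = rng.randint(0, 2, size=100)
+        clf.partial_fit(X_batch, y_batch, classes=classes)
+
+    print(clf.predict(rng.rand(5, 20)))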
+
+Here is a list of incremental estimators for different tasks:
+
+  - Classification
+      + :class:`sklearn.naive_bayes.MultinomialNB`
+      + :class:`sklearn.naive_bayes.BernoulliNB`
+      + :class:`sklearn.linear_model.Perceptron`
+      + :class:`sklearn.linear_model.SGDClassifier`
+      + :class:`sklearn.linear_model.PassiveAggressiveClassifier`
+      + :class:`sklearn.neural_network.MLPClassifier`
+  - Regression
+      + :class:`sklearn.linear_model.SGDRegressor`
+      + :class:`sklearn.linear_model.PassiveAggressiveRegressor`
+      + :class:`sklearn.neural_network.MLPRegressor`
+  - Clustering
+      + :class:`sklearn.cluster.MiniBatchKMeans`
+      + :class:`sklearn.cluster.Birch`
+  - Decomposition / feature extraction
+      + :class:`sklearn.decomposition.MiniBatchDictionaryLearning`
+      + :class:`sklearn.decomposition.IncrementalPCA`
+      + :class:`sklearn.decomposition.LatentDirichletAllocation`
+  - Preprocessing
+      + :class:`sklearn.preprocessing.StandardScaler`
+      + :class:`sklearn.preprocessing.MinMaxScaler`
+      + :class:`sklearn.preprocessing.MaxAbsScaler`
+
+For classification, a somewhat important thing to note is that although a
+stateless feature extraction routine may be able to cope with new/unseen
+attributes, the incremental learner itself may be unable to cope with
+new/unseen target classes. In this case you have to pass all the possible
+classes to the first ``partial_fit`` call using the ``classes=`` parameter.
+
+Another aspect to consider when choosing a proper algorithm is that not all of
+them put the same importance on each example over time. Namely, the
+``Perceptron`` is still sensitive to badly labeled examples even after many
+examples whereas the ``SGD*`` and ``PassiveAggressive*`` families are more
+robust to this kind of artifact. Conversely, the latter also tend to give less
+importance to remarkably different, yet properly labeled examples when they
+come late in the stream as their learning rate decreases over time.
+
+Examples
+..........
+
+Finally, we have a full-fledged example of
+:ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`. It is aimed at
+providing a starting point for people wanting to build out-of-core learning
+systems and demonstrates most of the notions discussed above.
+
+Furthermore, it also shows the evolution of the performance of different
+algorithms with the number of processed examples.
+
+.. |accuracy_over_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_001.png
+    :target: ../auto_examples/applications/plot_out_of_core_classification.html
+    :scale: 80
+
+.. centered:: |accuracy_over_time|
+
+Now looking at the computation time of the different parts, we see that the
+vectorization is much more expensive than learning itself. From the different
+algorithms, ``MultinomialNB`` is the most expensive, but its overhead can be
+mitigated by increasing the size of the mini-batches (exercise: change
+``minibatch_size`` to 100 and 10000 in the program and compare).
+
+.. |computation_time| image:: ../auto_examples/applications/images/sphx_glr_plot_out_of_core_classification_003.png
+    :target: ../auto_examples/applications/plot_out_of_core_classification.html
+    :scale: 80
+
+.. centered:: |computation_time|
+
+
+Notes
+......
+
+.. [1] Depending on the algorithm the mini-batch size can influence results or
+       not. SGD*, PassiveAggressive*, and discrete NaiveBayes are truly online
+       and are not affected by batch size. Conversely, MiniBatchKMeans
+       convergence rate is affected by the batch size.
Also, its memory
+       footprint can vary dramatically with batch size.
diff --git a/doc/datasets.rst b/doc/datasets.rst
new file mode 100644
index 0000000000000..68d3612354da1
--- /dev/null
+++ b/doc/datasets.rst
@@ -0,0 +1,34 @@
+.. Places parent toc into the sidebar
+
+:parenttoc: True
+
+.. include:: includes/big_toc_css.rst
+
+.. _datasets:
+
+=========================
+Dataset loading utilities
+=========================
+
+.. currentmodule:: sklearn.datasets
+
+The ``sklearn.datasets`` package embeds some small toy datasets
+as introduced in the :ref:`Getting Started <loading_example_dataset>` section.
+
+This package also features helpers to fetch larger datasets commonly
+used by the machine learning community to benchmark algorithms on data
+that comes from the 'real world'.
+
+To evaluate the impact of the scale of the dataset (``n_samples`` and
+``n_features``) while controlling the statistical properties of the data
+(typically the correlation and informativeness of the features), it is
+also possible to generate synthetic data.
+
+.. toctree::
+    :maxdepth: 2
+
+    datasets/general
+    datasets/toy_dataset
+    datasets/real_world
+    datasets/sample_generators
+    datasets/loading_other_datasets
diff --git a/doc/datasets/general.rst b/doc/datasets/general.rst
new file mode 100644
index 0000000000000..b31ebf3ef2eb7
--- /dev/null
+++ b/doc/datasets/general.rst
@@ -0,0 +1,45 @@
+.. Places parent toc into the sidebar
+
+:parenttoc: True
+
+General dataset API
+===================
+
+.. currentmodule:: sklearn.datasets
+
+There are three main kinds of dataset interfaces that can be used to get
+datasets depending on the desired type of dataset.
+
+**The dataset loaders.** They can be used to load small standard datasets,
+described in the :ref:`toy_datasets` section.
+
+**The dataset fetchers.** They can be used to download and load larger datasets,
+described in the :ref:`real_world_datasets` section.
+
+Both loader and fetcher functions return a :class:`~sklearn.utils.Bunch`
+object holding at least two items:
+an array of shape ``n_samples`` * ``n_features`` with
+key ``data`` (except for 20newsgroups) and a numpy array of
+length ``n_samples``, containing the target values, with key ``target``.
+
+The Bunch object is a dictionary that exposes its keys as attributes.
+For more information about the Bunch object, see :class:`~sklearn.utils.Bunch`.
+
+It's also possible for almost all of these functions to constrain the output
+to be a tuple containing only the data and the target, by setting the
+``return_X_y`` parameter to ``True``.
+
+The datasets also contain a full description in their ``DESCR`` attribute and
+some contain ``feature_names`` and ``target_names``. See the dataset
+descriptions below for details.
+
+**The dataset generation functions.** They can be used to generate controlled
+synthetic datasets, described in the :ref:`sample_generators` section.
+
+These functions return a tuple ``(X, y)`` consisting of a ``n_samples`` *
+``n_features`` numpy array ``X`` and an array of length ``n_samples``
+containing the targets ``y``.
+
+In addition, there are also miscellaneous tools to load datasets of other
+formats or from other locations, described in the :ref:`loading_other_datasets`
+section.
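+
+As a short illustration of the interface described above (using the iris
+loader as an arbitrary example)::
+
+    >>> from sklearn.datasets import load_iris
+    >>> bunch = load_iris()
+    >>> bunch.data.shape              # array stored under the ``data`` key
+    (150, 4)
+    >>> bunch.target.shape            # targets stored under the ``target`` key
+    (150,)
+    >>> X, y = load_iris(return_X_y=True)  # only the (data, target) tuple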
diff --git a/doc/datasets/index.rst b/doc/datasets/loading_other_datasets.rst
similarity index 55%
rename from doc/datasets/index.rst
rename to doc/datasets/loading_other_datasets.rst
index 7beccbfaf11ca..de4b9bb3f17b4 100644
--- a/doc/datasets/index.rst
+++ b/doc/datasets/loading_other_datasets.rst
@@ -2,268 +2,13 @@
 :parenttoc: True
-
-.. _datasets:
-
-=========================
-Dataset loading utilities
-=========================
-
-.. currentmodule:: sklearn.datasets
-
-The ``sklearn.datasets`` package embeds some small toy datasets
-as introduced in the :ref:`Getting Started ` section.
-
-This package also features helpers to fetch larger datasets commonly
-used by the machine learning community to benchmark algorithms on data
-that comes from the 'real world'.
-
-To evaluate the impact of the scale of the dataset (``n_samples`` and
-``n_features``) while controlling the statistical properties of the data
-(typically the correlation and informativeness of the features), it is
-also possible to generate synthetic data.
-
-General dataset API
-===================
-
-There are three main kinds of dataset interfaces that can be used to get
-datasets depending on the desired type of dataset.
-
-**The dataset loaders.** They can be used to load small standard datasets,
-described in the :ref:`toy_datasets` section.
-
-**The dataset fetchers.** They can be used to download and load larger datasets,
-described in the :ref:`real_world_datasets` section.
-
-Both loaders and fetchers functions return a :class:`~sklearn.utils.Bunch`
-object holding at least two items:
-an array of shape ``n_samples`` * ``n_features`` with
-key ``data`` (except for 20newsgroups) and a numpy array of
-length ``n_samples``, containing the target values, with key ``target``.
-
-The Bunch object is a dictionary that exposes its keys are attributes.
-For more information about Bunch object, see :class:`~sklearn.utils.Bunch`:
-
-It's also possible for almost all of these function to constrain the output
-to be a tuple containing only the data and the target, by setting the
-``return_X_y`` parameter to ``True``.
-
-The datasets also contain a full description in their ``DESCR`` attribute and
-some contain ``feature_names`` and ``target_names``. See the dataset
-descriptions below for details.
-
-**The dataset generation functions.** They can be used to generate controlled
-synthetic datasets, described in the :ref:`sample_generators` section.
-
-These functions return a tuple ``(X, y)`` consisting of a ``n_samples`` *
-``n_features`` numpy array ``X`` and an array of length ``n_samples``
-containing the targets ``y``.
-
-In addition, there are also miscellaneous tools to load datasets of other
-formats or from other locations, described in the :ref:`loading_other_datasets`
-section.
-
-.. _toy_datasets:
-
-Toy datasets
-============
-
-scikit-learn comes with a few small standard datasets that do not require to
-download any file from some external website.
-
-They can be loaded using the following functions:
-
-.. autosummary::
-
-   :toctree: ../modules/generated/
-   :template: function.rst
-
-   load_boston
-   load_iris
-   load_diabetes
-   load_digits
-   load_linnerud
-   load_wine
-   load_breast_cancer
-
-These datasets are useful to quickly illustrate the behavior of the
-various algorithms implemented in scikit-learn. They are however often too
-small to be representative of real world machine learning tasks.
-
-.. include:: ../../sklearn/datasets/descr/boston_house_prices.rst
-
-.. include:: ../../sklearn/datasets/descr/iris.rst
-
-.. include:: ../../sklearn/datasets/descr/diabetes.rst
-
-.. include:: ../../sklearn/datasets/descr/digits.rst
-
-.. include:: ../../sklearn/datasets/descr/linnerud.rst
-
-.. include:: ../../sklearn/datasets/descr/wine_data.rst
-
-.. include:: ../../sklearn/datasets/descr/breast_cancer.rst
-
-.. _real_world_datasets:
-
-Real world datasets
-===================
-
-scikit-learn provides tools to load larger datasets, downloading them if
-necessary.
-
-They can be loaded using the following functions:
-
-.. autosummary::
-
-   :toctree: ../modules/generated/
-   :template: function.rst
-
-   fetch_olivetti_faces
-   fetch_20newsgroups
-   fetch_20newsgroups_vectorized
-   fetch_lfw_people
-   fetch_lfw_pairs
-   fetch_covtype
-   fetch_rcv1
-   fetch_kddcup99
-   fetch_california_housing
-
-.. include:: ../../sklearn/datasets/descr/olivetti_faces.rst
-
-.. include:: ../../sklearn/datasets/descr/twenty_newsgroups.rst
-
-.. include:: ../../sklearn/datasets/descr/lfw.rst
-
-.. include:: ../../sklearn/datasets/descr/covtype.rst
-
-.. include:: ../../sklearn/datasets/descr/rcv1.rst
-
-.. include:: ../../sklearn/datasets/descr/kddcup99.rst
-
-.. include:: ../../sklearn/datasets/descr/california_housing.rst
-
-.. _sample_generators:
-
-Generated datasets
-==================
-
-In addition, scikit-learn includes various random sample generators that
-can be used to build artificial datasets of controlled size and complexity.
-
-Generators for classification and clustering
---------------------------------------------
-
-These generators produce a matrix of features and corresponding discrete
-targets.
-
-Single label
-~~~~~~~~~~~~
-
-Both :func:`make_blobs` and :func:`make_classification` create multiclass
-datasets by allocating each class one or more normally-distributed clusters of
-points. :func:`make_blobs` provides greater control regarding the centers and
-standard deviations of each cluster, and is used to demonstrate clustering.
-:func:`make_classification` specialises in introducing noise by way of:
-correlated, redundant and uninformative features; multiple Gaussian clusters
-per class; and linear transformations of the feature space.
-
-:func:`make_gaussian_quantiles` divides a single Gaussian cluster into
-near-equal-size classes separated by concentric hyperspheres.
-:func:`make_hastie_10_2` generates a similar binary, 10-dimensional problem.
-
-.. image:: ../auto_examples/datasets/images/sphx_glr_plot_random_dataset_001.png
-   :target: ../auto_examples/datasets/plot_random_dataset.html
-   :scale: 50
-   :align: center
-
-:func:`make_circles` and :func:`make_moons` generate 2d binary classification
-datasets that are challenging to certain algorithms (e.g. centroid-based
-clustering or linear classification), including optional Gaussian noise.
-They are useful for visualisation. :func:`make_circles` produces Gaussian data
-with a spherical decision boundary for binary classification, while
-:func:`make_moons` produces two interleaving half circles.
-
-Multilabel
-~~~~~~~~~~
-
-:func:`make_multilabel_classification` generates random samples with multiple
-labels, reflecting a bag of words drawn from a mixture of topics. The number of
-topics for each document is drawn from a Poisson distribution, and the topics
-themselves are drawn from a fixed random distribution. Similarly, the number of
-words is drawn from Poisson, with words drawn from a multinomial, where each
-topic defines a probability distribution over words. Simplifications with
-respect to true bag-of-words mixtures include:
-
-* Per-topic word distributions are independently drawn, where in reality all
-  would be affected by a sparse base distribution, and would be correlated.
-* For a document generated from multiple topics, all topics are weighted
-  equally in generating its bag of words.
-* Documents without labels words at random, rather than from a base
-  distribution.
-
-.. image:: ../auto_examples/datasets/images/sphx_glr_plot_random_multilabel_dataset_001.png
-   :target: ../auto_examples/datasets/plot_random_multilabel_dataset.html
-   :scale: 50
-   :align: center
-
-Biclustering
-~~~~~~~~~~~~
-
-.. autosummary::
-
-   :toctree: ../modules/generated/
-   :template: function.rst
-
-   make_biclusters
-   make_checkerboard
-
-
-Generators for regression
--------------------------
-
-:func:`make_regression` produces regression targets as an optionally-sparse
-random linear combination of random features, with noise. Its informative
-features may be uncorrelated, or low rank (few features account for most of the
-variance).
-
-Other regression generators generate functions deterministically from
-randomized features. :func:`make_sparse_uncorrelated` produces a target as a
-linear combination of four features with fixed coefficients.
-Others encode explicitly non-linear relations:
-:func:`make_friedman1` is related by polynomial and sine transforms;
-:func:`make_friedman2` includes feature multiplication and reciprocation; and
-:func:`make_friedman3` is similar with an arctan transformation on the target.
-
-Generators for manifold learning
---------------------------------
-
-.. autosummary::
-
-   :toctree: ../modules/generated/
-   :template: function.rst
-
-   make_s_curve
-   make_swiss_roll
-
-Generators for decomposition
-----------------------------
-
-.. autosummary::
-
-   :toctree: ../modules/generated/
-   :template: function.rst
-
-   make_low_rank_matrix
-   make_sparse_coded_signal
-   make_spd_matrix
-   make_sparse_spd_matrix
-
-
 .. _loading_other_datasets:
 
 Loading other datasets
 ======================
 
+.. currentmodule:: sklearn.datasets
+
 .. _sample_images:
 
 Sample images
diff --git a/doc/datasets/real_world.rst b/doc/datasets/real_world.rst
new file mode 100644
index 0000000000000..8ec4f5ba0344b
--- /dev/null
+++ b/doc/datasets/real_world.rst
@@ -0,0 +1,44 @@
+.. Places parent toc into the sidebar
+
+:parenttoc: True
+
+.. _real_world_datasets:
+
+Real world datasets
+===================
+
+.. currentmodule:: sklearn.datasets
+
+scikit-learn provides tools to load larger datasets, downloading them if
+necessary.
+
+They can be loaded using the following functions:
+
+.. autosummary::
+
+   :toctree: ../modules/generated/
+   :template: function.rst
+
+   fetch_olivetti_faces
+   fetch_20newsgroups
+   fetch_20newsgroups_vectorized
+   fetch_lfw_people
+   fetch_lfw_pairs
+   fetch_covtype
+   fetch_rcv1
+   fetch_kddcup99
+   fetch_california_housing
+
+.. include:: ../../sklearn/datasets/descr/olivetti_faces.rst
+
+.. include:: ../../sklearn/datasets/descr/twenty_newsgroups.rst
+
+.. include:: ../../sklearn/datasets/descr/lfw.rst
+
+.. include:: ../../sklearn/datasets/descr/covtype.rst
+
+.. include:: ../../sklearn/datasets/descr/rcv1.rst
+
+.. include:: ../../sklearn/datasets/descr/kddcup99.rst
+
+.. include:: ../../sklearn/datasets/descr/california_housing.rst
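A minimal sketch of the fetcher workflow documented above (an illustrative sketch, not part of the patch): the first call downloads the data and caches it locally, by default under ``~/scikit_learn_data`` (``data_home`` overrides the cache location); later calls load from the cache::

    from sklearn.datasets import fetch_california_housing

    # Downloads on first call (requires network), then loads from the cache.
    housing = fetch_california_housing(data_home="/tmp/sklearn_data")
    print(housing.data.shape)      # (20640, 8)
    print(housing.target.shape)    # (20640,)
    print(housing.feature_names)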
diff --git a/doc/datasets/sample_generators.rst b/doc/datasets/sample_generators.rst
new file mode 100644
index 0000000000000..6f56f4c21acc8
--- /dev/null
+++ b/doc/datasets/sample_generators.rst
@@ -0,0 +1,121 @@
+.. Places parent toc into the sidebar
+
+:parenttoc: True
+
+.. _sample_generators:
+
+Generated datasets
+==================
+
+.. currentmodule:: sklearn.datasets
+
+scikit-learn also includes various random sample generators that
+can be used to build artificial datasets of controlled size and complexity.
+
+Generators for classification and clustering
+--------------------------------------------
+
+These generators produce a matrix of features and corresponding discrete
+targets.
+
+Single label
+~~~~~~~~~~~~
+
+Both :func:`make_blobs` and :func:`make_classification` create multiclass
+datasets by allocating each class one or more normally-distributed clusters of
+points. :func:`make_blobs` provides greater control regarding the centers and
+standard deviations of each cluster, and is used to demonstrate clustering.
+:func:`make_classification` specialises in introducing noise by way of:
+correlated, redundant and uninformative features; multiple Gaussian clusters
+per class; and linear transformations of the feature space.
+
+:func:`make_gaussian_quantiles` divides a single Gaussian cluster into
+near-equal-size classes separated by concentric hyperspheres.
+:func:`make_hastie_10_2` generates a similar binary, 10-dimensional problem.
+
+.. image:: ../auto_examples/datasets/images/sphx_glr_plot_random_dataset_001.png
+   :target: ../auto_examples/datasets/plot_random_dataset.html
+   :scale: 50
+   :align: center
+
+:func:`make_circles` and :func:`make_moons` generate 2d binary classification
+datasets that are challenging to certain algorithms (e.g. centroid-based
+clustering or linear classification), including optional Gaussian noise.
+They are useful for visualisation. :func:`make_circles` produces Gaussian data
+with a spherical decision boundary for binary classification, while
+:func:`make_moons` produces two interleaving half circles.
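A short sketch of the single-label generators described above (an illustrative sketch, not part of the patch; the shapes in the comments follow from these exact arguments)::

    from sklearn.datasets import make_classification, make_moons

    # 100 samples, 20 features of which 5 are informative, 3 classes.
    X, y = make_classification(n_samples=100, n_features=20,
                               n_informative=5, n_classes=3,
                               random_state=0)
    print(X.shape, y.shape)   # (100, 20) (100,)

    # Two noisy interleaving half circles, useful for visualisation.
    X, y = make_moons(n_samples=100, noise=0.1, random_state=0)
    print(X.shape, y.shape)   # (100, 2) (100,)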
+
+Multilabel
+~~~~~~~~~~
+
+:func:`make_multilabel_classification` generates random samples with multiple
+labels, reflecting a bag of words drawn from a mixture of topics. The number of
+topics for each document is drawn from a Poisson distribution, and the topics
+themselves are drawn from a fixed random distribution. Similarly, the number of
+words is drawn from Poisson, with words drawn from a multinomial, where each
+topic defines a probability distribution over words. Simplifications with
+respect to true bag-of-words mixtures include:
+
+* Per-topic word distributions are independently drawn, where in reality all
+  would be affected by a sparse base distribution, and would be correlated.
+* For a document generated from multiple topics, all topics are weighted
+  equally in generating its bag of words.
+* Documents without labels draw words at random, rather than from a base
+  distribution.
+
+.. image:: ../auto_examples/datasets/images/sphx_glr_plot_random_multilabel_dataset_001.png
+   :target: ../auto_examples/datasets/plot_random_multilabel_dataset.html
+   :scale: 50
+   :align: center
+
+Biclustering
+~~~~~~~~~~~~
+
+.. autosummary::
+
+   :toctree: ../modules/generated/
+   :template: function.rst
+
+   make_biclusters
+   make_checkerboard
+
+
+Generators for regression
+-------------------------
+
+:func:`make_regression` produces regression targets as an optionally-sparse
+random linear combination of random features, with noise. Its informative
+features may be uncorrelated, or low rank (few features account for most of the
+variance).
+
+Other regression generators generate functions deterministically from
+randomized features. :func:`make_sparse_uncorrelated` produces a target as a
+linear combination of four features with fixed coefficients.
+Others encode explicitly non-linear relations:
+:func:`make_friedman1` is related by polynomial and sine transforms;
+:func:`make_friedman2` includes feature multiplication and reciprocation; and
+:func:`make_friedman3` is similar with an arctan transformation on the target.
+
+Generators for manifold learning
+--------------------------------
+
+.. autosummary::
+
+   :toctree: ../modules/generated/
+   :template: function.rst
+
+   make_s_curve
+   make_swiss_roll
+
+Generators for decomposition
+----------------------------
+
+.. autosummary::
+
+   :toctree: ../modules/generated/
+   :template: function.rst
+
+   make_low_rank_matrix
+   make_sparse_coded_signal
+   make_spd_matrix
+   make_sparse_spd_matrix
diff --git a/doc/datasets/toy_dataset.rst b/doc/datasets/toy_dataset.rst
new file mode 100644
index 0000000000000..f65464d85bc10
--- /dev/null
+++ b/doc/datasets/toy_dataset.rst
@@ -0,0 +1,46 @@
+.. Places parent toc into the sidebar
+
+:parenttoc: True
+
+.. _toy_datasets:
+
+Toy datasets
+============
+
+.. currentmodule:: sklearn.datasets
+
+scikit-learn comes with a few small standard datasets that do not require
+downloading any file from an external website.
+
+They can be loaded using the following functions:
+
+.. autosummary::
+
+   :toctree: ../modules/generated/
+   :template: function.rst
+
+   load_boston
+   load_iris
+   load_diabetes
+   load_digits
+   load_linnerud
+   load_wine
+   load_breast_cancer
+
+These datasets are useful to quickly illustrate the behavior of the
+various algorithms implemented in scikit-learn. However, they are often too
+small to be representative of real world machine learning tasks.
+
+.. include:: ../../sklearn/datasets/descr/boston_house_prices.rst
+
+.. include:: ../../sklearn/datasets/descr/iris.rst
+
+.. include:: ../../sklearn/datasets/descr/diabetes.rst
+
+.. include:: ../../sklearn/datasets/descr/digits.rst
+
+.. include:: ../../sklearn/datasets/descr/linnerud.rst
+
+.. include:: ../../sklearn/datasets/descr/wine_data.rst
+
+.. include:: ../../sklearn/datasets/descr/breast_cancer.rst
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
index cd65983d1ee86..464b7918d7ba5 100644
--- a/doc/user_guide.rst
+++ b/doc/user_guide.rst
@@ -26,5 +26,5 @@ User Guide
    inspection.rst
    visualizations.rst
    data_transforms.rst
-   Dataset loading utilities
-   modules/computing.rst
+   datasets.rst
+   computing.rst

From 4092a544e13dadd850f7ebcecbd59b2d294116c2 Mon Sep 17 00:00:00 2001
Fan" Date: Sat, 25 Jul 2020 14:07:21 -0400 Subject: [PATCH 09/11] DOC Address comments --- doc/datasets.rst | 39 +++++++++++++++++++++++++++++++++- doc/datasets/general.rst | 45 ---------------------------------------- 2 files changed, 38 insertions(+), 46 deletions(-) delete mode 100644 doc/datasets/general.rst diff --git a/doc/datasets.rst b/doc/datasets.rst index 68d3612354da1..30efdae06b1e3 100644 --- a/doc/datasets.rst +++ b/doc/datasets.rst @@ -24,10 +24,47 @@ To evaluate the impact of the scale of the dataset (``n_samples`` and (typically the correlation and informativeness of the features), it is also possible to generate synthetic data. +**General dataset API.** There are three main kinds of dataset interfaces that +can be used to get datasets depending on the desired type of dataset. + +**The dataset loaders.** They can be used to load small standard datasets, +described in the :ref:`toy_datasets` section. + +**The dataset fetchers.** They can be used to download and load larger datasets, +described in the :ref:`real_world_datasets` section. + +Both loaders and fetchers functions return a :class:`~sklearn.utils.Bunch` +object holding at least two items: +an array of shape ``n_samples`` * ``n_features`` with +key ``data`` (except for 20newsgroups) and a numpy array of +length ``n_samples``, containing the target values, with key ``target``. + +The Bunch object is a dictionary that exposes its keys are attributes. +For more information about Bunch object, see :class:`~sklearn.utils.Bunch`: + +It's also possible for almost all of these function to constrain the output +to be a tuple containing only the data and the target, by setting the +``return_X_y`` parameter to ``True``. + +The datasets also contain a full description in their ``DESCR`` attribute and +some contain ``feature_names`` and ``target_names``. See the dataset +descriptions below for details. + +**The dataset generation functions.** They can be used to generate controlled +synthetic datasets, described in the :ref:`sample_generators` section. + +These functions return a tuple ``(X, y)`` consisting of a ``n_samples`` * +``n_features`` numpy array ``X`` and an array of length ``n_samples`` +containing the targets ``y``. + +In addition, there are also miscellaneous tools to load datasets of other +formats or from other locations, described in the :ref:`loading_other_datasets` +section. + + .. toctree:: :maxdepth: 2 - datasets/general datasets/toy_dataset datasets/real_world datasets/sample_generators diff --git a/doc/datasets/general.rst b/doc/datasets/general.rst deleted file mode 100644 index b31ebf3ef2eb7..0000000000000 --- a/doc/datasets/general.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. Places parent toc into the sidebar - -:parenttoc: True - -General dataset API -=================== - -.. currentmodule:: sklearn.datasets - -There are three main kinds of dataset interfaces that can be used to get -datasets depending on the desired type of dataset. - -**The dataset loaders.** They can be used to load small standard datasets, -described in the :ref:`toy_datasets` section. - -**The dataset fetchers.** They can be used to download and load larger datasets, -described in the :ref:`real_world_datasets` section. - -Both loaders and fetchers functions return a :class:`~sklearn.utils.Bunch` -object holding at least two items: -an array of shape ``n_samples`` * ``n_features`` with -key ``data`` (except for 20newsgroups) and a numpy array of -length ``n_samples``, containing the target values, with key ``target``. 
-
-The Bunch object is a dictionary that exposes its keys as attributes.
-For more information about Bunch objects, see :class:`~sklearn.utils.Bunch`.
-
-It's also possible for almost all of these functions to constrain the output
-to be a tuple containing only the data and the target, by setting the
-``return_X_y`` parameter to ``True``.
-
-The datasets also contain a full description in their ``DESCR`` attribute and
-some contain ``feature_names`` and ``target_names``. See the dataset
-descriptions below for details.
-
-**The dataset generation functions.** They can be used to generate controlled
-synthetic datasets, described in the :ref:`sample_generators` section.
-
-These functions return a tuple ``(X, y)`` consisting of a ``n_samples`` *
-``n_features`` numpy array ``X`` and an array of length ``n_samples``
-containing the targets ``y``.
-
-In addition, there are also miscellaneous tools to load datasets of other
-formats or from other locations, described in the :ref:`loading_other_datasets`
-section.

From 288a743ba0f13b305937a724943e8f90332d8aec Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Sat, 25 Jul 2020 14:09:53 -0400
Subject: [PATCH 10/11] DOC Address comments

---
 doc/computing.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/computing.rst b/doc/computing.rst
index 6e15da5c11837..6732b754918b0 100644
--- a/doc/computing.rst
+++ b/doc/computing.rst
@@ -6,6 +6,8 @@
 Computing with scikit-learn
 ============================
 
+.. include:: includes/big_toc_css.rst
+
 .. toctree::
    :maxdepth: 2

From ea9b37e575fb83653431815c43141e192e81cb86 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Sat, 25 Jul 2020 23:37:08 -0400
Subject: [PATCH 11/11] DOC Adds docstring

---
 doc/sphinxext/add_toctree_functions.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/sphinxext/add_toctree_functions.py b/doc/sphinxext/add_toctree_functions.py
index 88110e4207c73..b77788a5d98b4 100644
--- a/doc/sphinxext/add_toctree_functions.py
+++ b/doc/sphinxext/add_toctree_functions.py
@@ -52,6 +52,8 @@ def get_nav_object(maxdepth=None, collapse=True, numbered=False, **kwargs):
         collapse: bool
             Whether to only include sub-pages of the currently-active page,
             instead of sub-pages of all top-level pages of the site.
+        numbered: bool
+            Whether to add section numbers to titles
         kwargs: key/val pairs
             Passed to the `TocTree.get_toctree_for` Sphinx method
         """
@@ -95,6 +97,8 @@ def docutils_node_to_jinja(list_item, only_pages=False, numbered=False):
     only_pages : bool
         Only include items for full pages in the output dictionary. Exclude
        anchor links (TOC items with a URL that starts with #)
+    numbered: bool
+        Whether to add section numbers to titles
 
     Returns
    -------