diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index b9fd178..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.github/config.yml b/.github/config.yml index a0ce2a0..55dcf7c 100644 --- a/.github/config.yml +++ b/.github/config.yml @@ -4,7 +4,7 @@ # Comment to be posted to on first time issues newIssueWelcomeComment: | - [![Welcome Banner](https://zenodo.org/api/iiif/v2/0c0188d3-d03c-4830-a6e3-00405f5c22fa:8ff47a85-7250-4d86-8e48-2f346b48b2c1:BannerWelcome.jpg/full/750,/0/default.jpg)](https://zenodo.org/record/3695300) + ![Welcome Banner](https://raw.githubusercontent.com/scikit-learn/blog/main/welcome-bot/BannerWelcome.jpg) :tada: Welcome to _scikit-learn Blog_! :tada: We're really excited to have your input into the project! :sparkling_heart: @@ -16,7 +16,7 @@ newIssueWelcomeComment: | # Comment to be posted to on PRs from first time contributors in your repository newPRWelcomeComment: | - [![Thank You Banner](https://zenodo.org/api/iiif/v2/0c0188d3-d03c-4830-a6e3-00405f5c22fa:7fbd97cf-283b-480c-b8e1-11866e26245c:BannerThanks.jpg/full/750,/0/default.jpg)](https://zenodo.org/record/3695300) + ![Thank You Banner](https://raw.githubusercontent.com/scikit-learn/blog/main/welcome-bot/BannerThanks.jpg) :sparkling_heart: Thanks for opening this pull request! :sparkling_heart: _scikit-learn_ community really appreciates your time and effort to contribute to the project. @@ -27,7 +27,7 @@ newPRWelcomeComment: | # Comment to be posted to on pull requests merged by a first time user firstPRMergeComment: | - [![Congratulations Banner](https://zenodo.org/api/iiif/v2/0c0188d3-d03c-4830-a6e3-00405f5c22fa:32fbdb89-ae1b-434e-830c-88ade86724cc:BannerCongratulations.jpg/full/750,/0/default.jpg)](https://zenodo.org/record/3695300) + ![Congratulations Banner](https://raw.githubusercontent.com/scikit-learn/blog/main/welcome-bot/BannerCongratulations.jpg) Congrats on merging your first pull request! :tada: We here at _scikit-learn_ are proud of you! 
:sparkling_heart: diff --git a/.github/workflows/add_archives.yml b/.github/workflows/add_archives.yml index 393d838..607a7cb 100644 --- a/.github/workflows/add_archives.yml +++ b/.github/workflows/add_archives.yml @@ -10,7 +10,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Create required folders run: | diff --git a/.gitignore b/.gitignore index 14ae388..e6487b6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ _site vendor *.DS_Store .DS_Store +assets/.DS_Store assets/images/.DS_Store +.bundle diff --git a/Gemfile.lock b/Gemfile.lock index 9a35a42..845b586 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -14,67 +14,50 @@ GIT GEM remote: https://rubygems.org/ specs: - activesupport (6.0.4.4) + activesupport (6.1.7.10) concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) + i18n (>= 1.6, < 2) + minitest (>= 5.1) + tzinfo (~> 2.0) + zeitwerk (~> 2.3) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) + base64 (0.2.0) coffee-script (2.4.1) coffee-script-source execjs - coffee-script-source (1.11.1) + coffee-script-source (1.12.2) colorator (1.1.0) - commonmarker (0.17.13) - ruby-enum (~> 0.5) - concurrent-ruby (1.1.9) - dnsruby (1.61.9) - simpleidn (~> 0.1) + commonmarker (0.23.11) + concurrent-ruby (1.3.5) + dnsruby (1.72.3) + base64 (~> 0.2.0) + simpleidn (~> 0.2.1) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) - ethon (0.15.0) + ethon (0.16.0) ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.8.1) - faraday (1.9.3) - faraday-em_http (~> 1.0) - faraday-em_synchrony (~> 1.0) - faraday-excon (~> 1.1) - faraday-httpclient (~> 1.0) - faraday-multipart (~> 1.0) - faraday-net_http (~> 1.0) - faraday-net_http_persistent (~> 1.0) - faraday-patron (~> 1.0) - faraday-rack (~> 1.0) - faraday-retry (~> 1.0) + execjs (2.10.0) + faraday (2.8.1) + base64 + 
faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) - faraday-em_http (1.0.0) - faraday-em_synchrony (1.0.0) - faraday-excon (1.1.0) - faraday-httpclient (1.0.1) - faraday-multipart (1.0.3) - multipart-post (>= 1.2, < 3) - faraday-net_http (1.0.1) - faraday-net_http_persistent (1.2.0) - faraday-patron (1.0.0) - faraday-rack (1.0.0) - faraday-retry (1.0.3) - ffi (1.15.5) + faraday-net_http (3.0.2) + ffi (1.17.1) forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (223) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.0) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) + gemoji (4.1.0) + github-pages (231) + github-pages-health-check (= 1.18.2) + jekyll (= 3.9.5) + jekyll-avatar (= 0.8.0) + jekyll-coffeescript (= 1.2.2) + jekyll-commonmark-ghpages (= 0.4.0) + jekyll-default-layout (= 0.1.5) + jekyll-feed (= 0.17.0) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) + jekyll-github-metadata (= 2.16.1) jekyll-include-cache (= 0.2.1) jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) @@ -84,7 +67,7 @@ GEM jekyll-relative-links (= 0.6.1) jekyll-remote-theme (= 0.4.3) jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.7.1) + jekyll-seo-tag (= 2.8.0) jekyll-sitemap (= 1.4.0) jekyll-swiss (= 1.0.0) jekyll-theme-architect (= 0.2.0) @@ -101,32 +84,32 @@ GEM jekyll-theme-tactile (= 0.2.0) jekyll-theme-time-machine (= 0.2.0) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.1) + jemoji (= 0.13.0) + kramdown (= 2.4.0) kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) + liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) - nokogiri (>= 1.12.5, < 2.0) - rouge (= 3.26.0) + nokogiri (>= 1.13.6, < 2.0) + rouge (= 3.30.0) terminal-table (~> 1.4) - github-pages-health-check (1.17.9) + github-pages-health-check (1.18.2) addressable (~> 2.3) dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) + 
octokit (>= 4, < 8) + public_suffix (>= 3.0, < 6.0) typhoeus (~> 1.3) - html-pipeline (2.14.0) + html-pipeline (2.14.3) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (0.9.5) + i18n (1.14.7) concurrent-ruby (~> 1.0) - jekyll (3.9.0) + jekyll (3.9.5) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) - i18n (~> 0.7) + i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) kramdown (>= 1.17, < 3) @@ -137,27 +120,27 @@ GEM safe_yaml (~> 1.0) jekyll-archives (2.2.1) jekyll (>= 3.6, < 5.0) - jekyll-avatar (0.7.0) + jekyll-avatar (0.8.0) jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) + jekyll-coffeescript (1.2.2) coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) + coffee-script-source (~> 1.12) + jekyll-commonmark (1.4.0) + commonmarker (~> 0.22) + jekyll-commonmark-ghpages (0.4.0) + commonmarker (~> 0.23.7) + jekyll (~> 3.9.0) + jekyll-commonmark (~> 1.4.0) + rouge (>= 2.0, < 5.0) + jekyll-default-layout (0.1.5) + jekyll (>= 3.0, < 5.0) + jekyll-feed (0.17.0) jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.13.0) + jekyll-github-metadata (2.16.1) jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) + octokit (>= 4, < 7, != 4.4.0) jekyll-include-cache (0.2.1) jekyll (>= 3.7, < 5.0) jekyll-mentions (1.6.0) @@ -179,12 +162,12 @@ GEM rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) - jekyll-seo-tag (2.7.1) + jekyll-seo-tag (2.8.0) jekyll (>= 3.8, < 5.0) jekyll-sitemap (1.4.0) jekyll (>= 3.7, < 5.0) jekyll-swiss (1.0.0) - jekyll-target-blank (2.0.0) + jekyll-target-blank (2.0.2) jekyll (>= 3.0, < 5.0) nokogiri (~> 1.10) jekyll-theme-architect (0.2.0) @@ -232,70 +215,64 @@ GEM 
jekyll-twitter-plugin (2.1.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) + jemoji (0.13.0) + gemoji (>= 3, < 5) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.1) + kramdown (2.4.0) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) - liquid (4.0.3) - listen (3.7.0) + liquid (4.0.4) + listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) + mini_portile2 (2.8.8) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.15.0) - multipart-post (2.1.1) - nokogiri (1.13.0-x86_64-darwin) + minitest (5.25.4) + nokogiri (1.13.10) + mini_portile2 (~> 2.8.0) racc (~> 1.4) - octokit (4.22.0) - faraday (>= 0.9) - sawyer (~> 0.8.0, >= 0.5.3) + octokit (4.25.1) + faraday (>= 1, < 3) + sawyer (~> 0.9) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (4.0.6) - racc (1.6.0) - rb-fsevent (0.11.0) - rb-inotify (0.10.1) + public_suffix (5.1.1) + racc (1.8.1) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.2.5) - rouge (3.26.0) - ruby-enum (0.9.0) - i18n + rexml (3.4.1) + rouge (3.30.0) ruby2_keywords (0.0.5) - rubyzip (2.3.2) + rubyzip (2.4.1) safe_yaml (1.0.5) sass (3.7.4) sass-listen (~> 4.0.0) sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.2) + sawyer (0.9.2) addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - simpleidn (0.2.1) - unf (~> 0.1.4) + faraday (>= 0.17.3, < 3) + simpleidn (0.2.3) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) + typhoeus (1.4.1) ethon (>= 0.9.0) - tzinfo (1.2.9) - thread_safe (~> 0.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.8) + tzinfo (2.0.6) + concurrent-ruby (~> 1.0) unicode-display_width (1.8.0) - webrick (1.7.0) - zeitwerk (2.5.3) + webrick (1.9.1) + zeitwerk (2.6.18) PLATFORMS - x86_64-darwin-19 + ruby DEPENDENCIES github-pages @@ -314,4 +291,4 @@ DEPENDENCIES webrick (~> 1.7) BUNDLED WITH - 
2.3.4 + 1.17.2 diff --git a/_archives/categories/diversity.md b/_archives/categories/diversity.md new file mode 100644 index 0000000..07ac025 --- /dev/null +++ b/_archives/categories/diversity.md @@ -0,0 +1,6 @@ +--- +title: Diversity +category: "Diversity" +layout: archive-categories +permalink: "category/diversity" +--- diff --git a/_archives/categories/updates.md b/_archives/categories/updates.md new file mode 100644 index 0000000..431725f --- /dev/null +++ b/_archives/categories/updates.md @@ -0,0 +1,6 @@ +--- +title: Updates +category: "Updates" +layout: archive-categories +permalink: "category/updates" +--- diff --git a/_archives/tags/diversity.md b/_archives/tags/diversity.md new file mode 100644 index 0000000..eae2ee4 --- /dev/null +++ b/_archives/tags/diversity.md @@ -0,0 +1,6 @@ +--- +title: Diversity +tag: "Diversity" +layout: archive-tags +permalink: "tag/diversity" +--- diff --git a/_archives/tags/funding.md b/_archives/tags/funding.md new file mode 100644 index 0000000..333db5c --- /dev/null +++ b/_archives/tags/funding.md @@ -0,0 +1,6 @@ +--- +title: Funding +tag: "Funding" +layout: archive-tags +permalink: "tag/funding" +--- diff --git a/_archives/tags/inclusiveness.md b/_archives/tags/inclusiveness.md new file mode 100644 index 0000000..03bc0a7 --- /dev/null +++ b/_archives/tags/inclusiveness.md @@ -0,0 +1,6 @@ +--- +title: Inclusiveness +tag: "Inclusiveness" +layout: archive-tags +permalink: "tag/inclusiveness" +--- diff --git a/_archives/tags/internship.md b/_archives/tags/internship.md new file mode 100644 index 0000000..ca426f7 --- /dev/null +++ b/_archives/tags/internship.md @@ -0,0 +1,6 @@ +--- +title: Internship +tag: "Internship" +layout: archive-tags +permalink: "tag/internship" +--- diff --git a/_archives/tags/license.md b/_archives/tags/license.md new file mode 100644 index 0000000..90c79cd --- /dev/null +++ b/_archives/tags/license.md @@ -0,0 +1,6 @@ +--- +title: License +tag: "License" +layout: archive-tags +permalink: "tag/license" 
+--- diff --git a/_archives/tags/machine-learning.md b/_archives/tags/machine-learning.md new file mode 100644 index 0000000..d294c8e --- /dev/null +++ b/_archives/tags/machine-learning.md @@ -0,0 +1,6 @@ +--- +title: Machine Learning +tag: "Machine Learning" +layout: archive-tags +permalink: "tag/machine-learning" +--- diff --git a/_archives/tags/sponsor.md b/_archives/tags/sponsor.md new file mode 100644 index 0000000..01cc345 --- /dev/null +++ b/_archives/tags/sponsor.md @@ -0,0 +1,6 @@ +--- +title: Sponsor +tag: "Sponsor" +layout: archive-tags +permalink: "tag/sponsor" +--- diff --git a/_archives/years/2023.md b/_archives/years/2023.md new file mode 100644 index 0000000..7e1c0aa --- /dev/null +++ b/_archives/years/2023.md @@ -0,0 +1,6 @@ +--- +title: 2023 +year: "2023" +layout: archive-years +permalink: "year/2023" +--- diff --git a/_archives/years/2024.md b/_archives/years/2024.md new file mode 100644 index 0000000..6c2e468 --- /dev/null +++ b/_archives/years/2024.md @@ -0,0 +1,6 @@ +--- +title: 2024 +year: "2024" +layout: archive-years +permalink: "year/2024" +--- diff --git a/_config.yml b/_config.yml index 8d8cf09..9114172 100644 --- a/_config.yml +++ b/_config.yml @@ -8,13 +8,12 @@ minimal_mistakes_skin: "scikit" #air", "aqua", "contrast", "dark", "dirt", "neon # Site Settings title: "scikit-learn Blog" -email: social@scikit-learn.org +email: description: "The official blog of scikit-learn, an open source library for machine learning in Python." 
logo: assets/images/scikit-learn-logo.png favicon: assets/images/scikit-learn-logo.png baseurl: "/" url: "https://blog.scikit-learn.org" # the base hostname -twitter_username: scikit_learn github_username: scikit-learn repository: scikit-learn/blog @@ -37,15 +36,18 @@ author: - label: "GitHub" icon: "fab fa-fw fa-github-square" url: "https://github.com/scikit-learn" - - label: "Twitter" - icon: "fab fa-fw fa-twitter-square" - url: "https://twitter.com/scikit_learn" - - label: "YouTube" - icon: "fab fa-fw fa-youtube" - url: "https://youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw" - label: "LinkedIn" icon: "fab fa-fw fa-linkedin" url: "https://linkedin.com/company/scikit-learn/" + - label: "Bluesky" + icon: "" + url: "https://bsky.app/profile/scikit-learn.org" + - label: "Mastodon" + icon: "fab fa-brands fa-mastodon" + url: "https://fosstodon.org/@sklearn" + - label: "YouTube" + icon: "fab fa-fw fa-youtube" + url: "https://www.youtube.com/@scikit-learn" - label: "Facebook" icon: "fab fa-fw fa-facebook-square" url: "https://facebook.com/scikitlearnofficial/" @@ -54,21 +56,25 @@ author: url: "https://instagram.com/scikitlearnofficial/" + # Site Footer footer: links: - label: "GitHub" icon: "fab fa-fw fa-github-square" url: "https://github.com/scikit-learn" - - label: "Twitter" - icon: "fab fa-fw fa-twitter-square" - url: "https://twitter.com/scikit_learn" - - label: "YouTube" - icon: "fab fa-fw fa-youtube" - url: "https://youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw" - label: "LinkedIn" icon: "fab fa-fw fa-linkedin" url: "https://linkedin.com/company/scikit-learn/" + - label: "Bluesky" + icon: "" + url: "https://bsky.app/profile/scikit-learn.org" + - label: "Mastodon" + icon: "fab fa-brands fa-mastodon" + url: "https://fosstodon.org/@sklearn" + - label: "YouTube" + icon: "fab fa-fw fa-youtube" + url: "https://www.youtube.com/@scikit-learn" - label: "Facebook" icon: "fab fa-fw fa-facebook-square" url: "https://facebook.com/scikitlearnofficial/" @@ -100,6 +106,10 @@ 
paginate: 5 # amount of posts to show paginate_path: /page:num/ timezone: +params: + description: "scikit-learn Blog Posts" + plausible: + dataDomain: blog.scikit-learn.org # SEO Related #google_site_verification : diff --git a/_data/navigation.yml b/_data/navigation.yml index 70aab0b..7793f9f 100644 --- a/_data/navigation.yml +++ b/_data/navigation.yml @@ -14,12 +14,14 @@ main: - title: "User Guide" url: "https://scikit-learn.org/stable/user_guide" - title: "API" - url: "https://scikit-learn.org/stable/modules/classes" + url: "https://scikit-learn.org/stable/api/index" - title: "Examples" url: "https://scikit-learn.org/stable/auto_examples/index" + # Sidebar docs: + - title: Archives children: - title: "Category" @@ -37,6 +39,15 @@ docs: url: https://scikit-learn.org/stable/faq - title: "Code of Conduct" url: https://python.org/psf/conduct/ + - title: "Contribute to scikit-learn" + url : "https://scikit-learn.org/stable/developers/contributing.html" + - title: "Contribute to the blog" + url : "https://github.com/scikit-learn/blog" + + - title: Community + children: + - title: "Monthly Meeting Minutes" + url: https://github.com/scikit-learn/administrative/tree/master/monthly_meetings - title: Mailing List children: diff --git a/_includes/social-share.html b/_includes/social-share.html new file mode 100644 index 0000000..9817015 --- /dev/null +++ b/_includes/social-share.html @@ -0,0 +1,13 @@ +
+

{{ site.data.ui-text[site.locale].share_on_label | default: "Share on" }}

+ + LinkedIn + + Bluesky + + Mastodon + + Facebook + + +
\ No newline at end of file diff --git a/_layouts/default.html b/_layouts/default.html index c50e9f2..ec7cbce 100644 --- a/_layouts/default.html +++ b/_layouts/default.html @@ -12,6 +12,8 @@ {% include head.html %} {% include head/custom.html %} + + diff --git a/_layouts/single.html b/_layouts/single.html index 852c4ca..c557ac1 100644 --- a/_layouts/single.html +++ b/_layouts/single.html @@ -22,6 +22,7 @@ {% if page.excerpt %}{% endif %} {% if page.date %}{% endif %} {% if page.last_modified_at %}{% endif %} + {% if page.featured-image %}{% endif %}
{% unless page.header.overlay_color or page.header.overlay_image %} diff --git a/_pages/sprints.md b/_pages/sprints.md index 7dae651..b11ef1e 100644 --- a/_pages/sprints.md +++ b/_pages/sprints.md @@ -7,8 +7,27 @@ author_profile: false ## Listing of Scikit-learn Sprints +- 2025 + - Online: [PyLadies Berlin](https://www.meetup.com/pyladies-berlin/events/305492587) (Feb 2025) + - Berlin, Germany: [PyLadies Berlin](https://www.meetup.com/pyladies-berlin/events/305397520/) (Feb 2025) + +- 2024 + - Paris, France: [PyData Paris](https://pydata.org/paris2024/sprints) (Sep 2024) + +- 2023 + - Zurich, Switzerland: [Python Sprints & Zurich WiMLDS](https://python-sprints.github.io/zurich/2023/11/07/scikit-learn-sprint.html) (Nov 2023) + - Berlin, Germany: [PyLadies Berlin](https://www.meetup.com/pyladies-berlin/events/294785222/) (Aug 2023) + - Berlin, Germany: [PyLadies Berlin](https://www.meetup.com/pyladies-berlin/events/294784938/) (Jul 2023) + - Paris, France: [scikit-learn Consortium](https://blog.scikit-learn.org/events/paris-dev-sprint/) (Jun 2023) + - Zurich, Switzerland: [Python Sprints Zurich](https://www.meetup.com/python-sprints-zurich/events/291718007/) (Mar 2023) + - Berlin, Germany: [PyLadies Berlin](https://www.meetup.com/pyladies-berlin/events/291249163/) (Feb 2023) + - Berlin, Germany: [PyLadies Berlin](https://www.meetup.com/pyladies-berlin/events/291248660/) (Feb 2023) - 2022 + - Paris, France: [PyLadies Paris](https://www.meetup.com/pyladiesparis/events/290003585/) (Dec 2022) + - Paris, France: [PyLadies Paris](https://www.meetup.com/pyladiesparis/events/289652024/) (Nov 2022) + - Paris, France: [PyLadies Paris](https://www.meetup.com/pyladiesparis/events/289471009/) (Nov 2022) + - Paris, France: [PyLadies Paris](https://www.meetup.com/pyladiesparis/events/289012536/) (Oct 2022) - Salta, Argentina: [SciPy Latin America](https://pythoncientifico.ar) (Sep 2022) - Basel, Switzerland: [EuroSciPy 2022](https://www.euroscipy.org/2022/index.html) (Sep 2022) - 
Dublin, Ireland: [EuroPython 2022](https://ep2022.europython.eu/mentored-sprint) (Jul 2022) @@ -34,7 +53,7 @@ author_profile: false - New York, NY: [NYC WiMLDS](http://wimlds.org/opensourcesprints-2/nyc-scikit-sprint-2019/) (Aug 24, 2019) - Austin, TX: [SciPy](https://www.scipy2019.scipy.org/sprints) (Jul 13-14, 2019) - Nairobi, Kenya: [WiMLDS sprint](http://wimlds.org/nairobi-scikit-sprint-2019/) (Jun 22, 2019) - - Paris, France: [core sprint, for advanced contributors](https://scikit-learn.fondation-inria.fr/en/scikit-learn-sprint-in-paris/) (Feb) + - Paris, France: [core sprint, for advanced contributors](https://scikit-learn.fondation-inria.fr/en/scikit-learn-sprint-in-paris/) (Feb 2019) - 2018 - New York, NY: [NYC WiMLDS](https://reshamas.github.io/highlights-from-the-2018-NYC-WiMLDS-scikit-sprint) (Sep 2018) - Austin, TX: [SciPy](http://gael-varoquaux.info/programming/sprint-on-scikit-learn-in-paris-and-austin.html) (open sprint, for new contributors) (Jul 2018) @@ -43,37 +62,38 @@ author_profile: false - Berkeley, CA: [UC Berkeley](https://github.com/scikit-image/scikit-image/wiki/UC-Berkeley-(BIDS)-sprint,-May-28-Jun-2-2018)(May 28 to Jun 2) - London, UK: ManAHL (April 21-22, 2018) - 2017 - - [Paris, France](http://gael-varoquaux.info/programming/scikit-learn-paris-sprint-2017.html) (Jun) - - New York, NY: [NYC WiMLDS](https://github.com/WiMLDS/scikit-sprint-nyc-2017/blob/master/README.md) (Mar) + - [Paris, France](http://gael-varoquaux.info/programming/scikit-learn-paris-sprint-2017.html) (Jun 2017) + - New York, NY: [NYC WiMLDS](https://github.com/WiMLDS/scikit-sprint-nyc-2017/blob/master/README.md) (Mar 2017) - 2016 - - SciPy: [Austin](https://scipy2016.scipy.org/ehome/146062/332969/) (Jul) + - SciPy: [Austin](https://scipy2016.scipy.org/ehome/146062/332969/index9a04.html?%26) (Jul 2016) - 2015 - - SciPy: [Austin](https://scipy2016.scipy.org/ehome/115969/292867/) (Jul) - - ODSC: San Francisco (Nov) - - Criteo: 
[Paris](https://twitter.com/GaelVaroquaux/status/656847270550310912) (Oct) - - PyData: Paris (Apr) + - SciPy: [Austin](https://scipy2015.scipy.org/ehome/115969/292867/index9a04.html?%26) (Jul 2015) + - ODSC: San Francisco (Nov 2015) + - Criteo: [Paris](https://twitter.com/GaelVaroquaux/status/656847270550310912) (Oct 2015) + - PyData: Paris (Apr 2015) - 2014 - - Euro SciPy, Cambridge (Aug) - - INRIA, Criteo, La paillasse, Tinyclues: [Paris](http://gael-varoquaux.info/programming/scikit-learn-2014-sprint-a-report.html) (Jul) - - Cloudera, SF (Feb) + - Euro SciPy, Cambridge (Aug 2014) + - INRIA, Criteo, La paillasse, Tinyclues: [Paris](http://gael-varoquaux.info/programming/scikit-learn-2014-sprint-a-report.html) (Jul 2014) + - Cloudera, SF (Feb 2014) - 2013 - - Paris (Jul) - - SciPy: [Austin](https://conference.scipy.org/scipy2013/sprint_detail.php?id=36) (Jun) + - [Paris, France](https://github.com/scikit-learn/administrative/blob/master/sprint_paris_2013/proposal.rst) (Jul 2013) + - Sponsors: [Télécom Paristech](https://www.telecom-paristech.fr/), [tinyclues](https://www.tinyclues.com/), [French Python Association](https://www.afpy.org/), [Fonds de la Recherche Scientifique](https://www.frs-fnrs.be) + - SciPy: [Austin](https://conference.scipy.org/scipy2013/sprint_detail.php?id=36) (Jun 2013) - 2012 - - EuroSciPy Bruxelles (Aug) - - PyCon France (Jul) - - SciPy: [Austin](http://conference.scipy.org/scipy2012/sprints/sprint_detail.php?id=15) (Jul) + - EuroSciPy Bruxelles (Aug 2012) + - PyCon France (Jul 2012) + - SciPy: [Austin](http://conference.scipy.org/scipy2012/sprints/sprint_detail.php?id=15) (Jul 2012) - 2011 - - NeurIPS: [Granada, Spain](http://gael-varoquaux.info/programming/scikit-learn-nips-2011-sprint-international-thanks-to-our-sponsors.html) (Dec) - - Euro SciPy: [Paris](http://fa.bianp.net/blog/2011/scikit-learn-euroscipy-2011-coding-sprint-day-one/) (Aug) - - Austin SciPy (Jul) - - Paris, Logilab (Apr) + - NeurIPS: [Granada, 
Spain](http://gael-varoquaux.info/programming/scikit-learn-nips-2011-sprint-international-thanks-to-our-sponsors.html) (Dec 2011) + - Euro SciPy: [Paris](http://fa.bianp.net/blog/2011/scikit-learn-euroscipy-2011-coding-sprint-day-one/) (Aug 2011) + - Austin SciPy (Jul 2011) + - Paris, Logilab (Apr 2011) - Boston MIT - 2010 - Paris, France: (Sep 2010) - Paris, France: (Jun 2010) - - Paris, France: (Mar 2010) - - **first release of scikit-learn** (Feb) + - [Paris, France](https://web.archive.org/web/20101118052247/http://fseoane.net/blog/2010/scikitslearn-coding-spring-in-paris/): (Mar 2010) + - **first release of scikit-learn** (Feb 2010) ## References - scikit learn wiki: [upcoming sprints](https://github.com/scikit-learn/scikit-learn/wiki/Upcoming-events) diff --git a/_posts/2019-01-11-wimlds-impact-report.md b/_posts/2019-01-11-wimlds-impact-report.md index 1e18b1d..d6df896 100644 --- a/_posts/2019-01-11-wimlds-impact-report.md +++ b/_posts/2019-01-11-wimlds-impact-report.md @@ -29,7 +29,7 @@ To address this gender imbalance for the scikit-learn library, Andreas Mueller, In 2017, **5** PRs were merged in: - **4** PRs were merged at the sprint - **1** PR was merged post-sprint without any follow-up - - The PR merged post-sprint was by [Sergul Aydore](https://twitter.com/sergulaydore). After attending this sprint, Sergul then went on to participate in the August 2018 [scikit-learn core sprint for advanced contributors](http://gael-varoquaux.info/programming/sprint-on-scikit-learn-in-paris-and-austin.html) in Paris. Sergul states: + - The PR merged post-sprint was by [Sergul Aydore](https://www.linkedin.com/in/sergül-aydöre-203193a/). After attending this sprint, Sergul then went on to participate in the August 2018 [scikit-learn core sprint for advanced contributors](http://gael-varoquaux.info/programming/sprint-on-scikit-learn-in-paris-and-austin.html) in Paris. 
Sergul states: >Participating in the March 2017 sprint helped me learn the basics and I was able to contribute to more complicated PRs in the August 2018 sprint. - No follow-up of open PRs was conducted. diff --git a/_posts/2022-03-28-maren-interview.md b/_posts/2022-03-28-maren-interview.md index 06efbd3..f6e5c37 100644 --- a/_posts/2022-03-28-maren-interview.md +++ b/_posts/2022-03-28-maren-interview.md @@ -3,6 +3,7 @@ title: "Interview with Maren Westermann: Extending the Impact of the scikit-lear date: March 28, 2022 categories: - Events + - Team tags: - Open Source - Sprints diff --git a/_posts/2022-05-12-pyconde-keynote-reshama.md b/_posts/2022-05-12-pyconde-keynote-reshama.md index 45ee830..b87be14 100644 --- a/_posts/2022-05-12-pyconde-keynote-reshama.md +++ b/_posts/2022-05-12-pyconde-keynote-reshama.md @@ -44,7 +44,7 @@ Reshama Shaikh is the Director of Data Umbrella. She is also on the Contributor ## Connecting - LinkedIn: [@reshamas](https://www.linkedin.com/in/reshamas/) -- Twitter: [@reshamas](https://twitter.com/reshamas) +- Bluesky: [@reshamas](https://bsky.app/profile/reshamas.bsky.social) - GitHub: [@reshamas](https://github.com/reshamas) - Medium: [@reshamas](https://medium.com/@reshamas) - Join the Data Umbrella [Meetup Group](https://www.meetup.com/data-umbrella/) diff --git a/_posts/2022-07-13-sprints-value.md b/_posts/2022-07-13-sprints-value.md index 71a0bd0..bc53b40 100644 --- a/_posts/2022-07-13-sprints-value.md +++ b/_posts/2022-07-13-sprints-value.md @@ -313,8 +313,8 @@ contributors (with a specific expertise) on advanced subjects when it is possibl ### Connecting and Supporting scikit-learn To connect with the scikit-learn project, these are the most active social media platforms: -- Twitter: [@scikit_learn](https://twitter.com/scikit_learn) - LinkedIn: [@scikit-learn](https://www.linkedin.com/company/scikit-learn/) +- Bluesky: [@scikit-learn.org](https://bsky.app/profile/scikit-learn.org) It is most welcome for users to “star” the code 
repository on GitHub: [scikit-learn/scikit-learn](https://github.com/scikit-learn/scikit-learn) diff --git a/_posts/2022-09-29-salta-sprint.md b/_posts/2022-09-29-salta-sprint.md index c4b7a62..eedc39c 100644 --- a/_posts/2022-09-29-salta-sprint.md +++ b/_posts/2022-09-29-salta-sprint.md @@ -22,7 +22,7 @@ As part of the event, we organized a [scikit-learn sprint](https://pythoncientif The main idea was to introduce the participants to the open source world and help them make their first contribution. The sprint event was an in-person event. -SciPy logo +SciPy logo ## Schedule - September 27, 2022 - **Pre-sprint** - 10:00 to 12:00 hs (UTC -3) diff --git a/_posts/2022-11-08-pandas-dataframe-output-for-sklearn-transformer.md b/_posts/2022-11-08-pandas-dataframe-output-for-sklearn-transformer.md new file mode 100644 index 0000000..29f52eb --- /dev/null +++ b/_posts/2022-11-08-pandas-dataframe-output-for-sklearn-transformer.md @@ -0,0 +1,44 @@ +--- +title: "Pandas DataFrame Output for sklearn Transformers" +date: November 8, 2022 +categories: + - Technical +tags: + - Performance +featured-image: pandas_output_sklearn_transformers.PNG + +postauthors: + - name: Sangam SwadiK + website: https://www.linkedin.com/in/sangam-swadi-k/ + image: sangam_swadik.jpg +--- + +
+ + {% include postauthor.html %} +
+ +## Video + + +## Upcoming feature in release 1.2 +Starting with the next release of [scikit-learn](https://github.com/scikit-learn/scikit-learn) (v1.2), pandas dataframe output will be available for all sklearn transformers! This will make running pipelines on dataframes much easier and provide better ways to track feature names. Previously, mapping a transformed output back into columns would be cumbersome as it might not be a one-to-one mapping in cases of complex preprocessing (e.g., polynomial features). + +The pandas dataframe output feature for transformers solves this by tracking features generated from pipelines automatically. The transformer output format can be configured explictly for either **numpy** or **pandas** output formats as shown in [sklearn.set_config](https://scikit-learn.org/dev/modules/generated/sklearn.set_config.html#sklearn.set_config) and the sample code below. +```python +from sklearn import set_config +set_config(transform_output = "pandas") +``` + +See the sample notebook, [pandas-dataframe-output-for-sklearn-transformer.ipynb](https://github.com/scikit-learn/blog/blob/main/assets/notebooks/sklearn-pandas-df-output.ipynb) and documentation for a more detailed example and usage. + +## Links to documentation and example notebook +- [Pandas output for transformers documentation](https://scikit-learn.org/dev/auto_examples/miscellaneous/plot_set_output.html#sphx-glr-auto-examples-miscellaneous-plot-set-output-py) +- [pandas-dataframe-output-for-sklearn-transformer.ipynb](https://github.com/scikit-learn/blog/blob/main/assets/notebooks/sklearn-pandas-df-output.ipynb) + + +## Reporting bugs +We'd love your feedback on this. In case of any suggestions or bugs, please report them at +[scikit-learn issues](https://github.com/scikit-learn/scikit-learn/issues) + +Thanks 🙏🏾 to maintainers: [**Thomas J. 
Fan**](https://github.com/thomasjpfan), [**Guillaume Lemaitre**](https://github.com/glemaitre) , [**Christian Lorentzen**](https://github.com/lorentzenchr) !! \ No newline at end of file diff --git a/_posts/2022-11-30-meekail-zain-interview.md b/_posts/2022-11-30-meekail-zain-interview.md new file mode 100644 index 0000000..2f8efe0 --- /dev/null +++ b/_posts/2022-11-30-meekail-zain-interview.md @@ -0,0 +1,99 @@ +--- +title: "Interview with Meekail Zain, scikit-learn Team Member" +date: November 30, 2022 +categories: + - Team +tags: + - Open Source +featured-image: meekail-zain-interview.png + +postauthors: + - name: Reshama Shaikh + website: https://reshamas.github.io + image: reshama_shaikh.jpeg + - name: Meekail Zain + website: https://www.linkedin.com/in/meekail-zain-02a412a2/ + image: meekail-zain.jpg +--- + +
+ + {% include postauthor.html %} +
+ +Posted by [Sangam SwadiK](https://www.linkedin.com/in/sangam-swadi-k/) + +Meekail Zain is a computer science PhD student at University of Georgia (USA), a member of Quinn Research Group and a software engineer at Quansight. Meekail officially joined the scikit-learn team as a maintainer in October 2022. + +1. __Tell us about yourself.__ + + I’m currently attending the University of Georgia, pursuing a PhD in computer science. My area of research predominantly focuses on deep learning, generative modeling, and statistical approaches to clustering. I’m in my third year, and at the time of writing about to begin my comprehensive exams. + + - GitHub: [@Micky774](https://github.com/Micky774) + - LinkedIn: [@meekail-zain](https://www.linkedin.com/in/meekail-zain-02a412a2/) + +1. __How did you first become involved in open source and scikit-learn?__ + + I first got involved as a user, as most people do. NumPy was a recurring day-to-day library for me, and scikit-learn was a de-facto necessity for several graduate courses. Originally I never really imagined being able to get to a point where I could affect change in these libraries since they seemed so well-established! + +1. __We would love to learn of your open source journey.__ + + My journey really kicked off when I went to work at Quansight and received funding through the [NASA Roses grant](https://numfocus.medium.com/numfocus-projects-receive-nasa-grants-deee374e7a57) to be able to dedicate time to contributing to scikit-learn. It was a huge jump from what I had known up until that point. I learned Python very informally in order to be able to use PyTorch to develop/deploy models for my research, and had little-to-no experience with things like continuous integration or strong API. At first I felt incredibly intimidated and unqualified, but at the same time absolutely thrilled that I was in a position to learn so many new things! 
+ *I started working on really simple changes to get used to the contribution workflow — things like removing excess whitespace and fixing typos* — and then graduated to slightly more complex tasks. Eventually I got to the point where I started to “understand” small corners of the codebase and could actually offer help on new issues because of that familiarity. After that, *I started reviewing others’ pull requests (PRs) and offering feedback in an unofficial capacity*, as well as taking on more challenging tasks across the codebase. That process of growth and escalation is still ongoing, and truly I hope it never ends. + +1. __To which OSS projects and communities do you contribute?__ + + NumPy, scikit-learn, and scipy. Right now it is heavily skewed towards scikit-learn with numpy being second most, but I’m hoping to take some more time to work on scipy in the near future! + +1. __What advice or tips do you have for people starting out in your field of work?__ + + *Find a way to enjoy the feeling of being surrounded by things that you haven’t yet mastered*. If you aim for growth — and indeed I think we all should — then you’ll find that you spend the majority of your time surrounded by things that you don’t quite understand, and the natural reaction to that is frustration and intimidation. If you can somehow convince yourself to also be excited by such an environment, you’ll find yourself growing every single day. Nobody starts off knowing everything :) + +1. __What do you find alluring about OSS?__ + + This is a tough one, there are many amazing points. If I had to select just a few, it would be (in no particular order): + - The growth potential + - The community + - The impact + + I’ve already discussed the growth potential so I’ll leave it at that. + + The **community** is fantastic as well! On every project the community base has its own unique personality of sorts, and they are all wonderful!
It’s amazing being able to see recurring users that post interesting issues, or take a stab at opening more complex PRs (pull requests). There’s a strong sense of companionship with the people that are also trying to improve the same project as you! It’s akin to a very niche club in high school. It’s a wonderful experience finding people obsessed with the same cool project as you are. + + Finally, the **impact**. At the end of the day, the work we do has some serious consequences. Each project is essential to so many different workflows and enables brilliant researchers and software engineers to build complex systems and solutions to cutting edge problems. It’s sometimes surreal to think about how essential some of these projects really are. + +1. __What pain points do you observe in community-led OSS?__ + + *Consensus is difficult*. This is a double-edged sword, since it carries some benefits too. With community-lead OSS, changes at every scale need to meet _some_ kind of consensus.* This ensures that the changes are well thought out and provides a layer of safety since the chance of uncaught mistakes propagating goes down with the number of people carefully reviewing changes* (for the most part). + + For example, in scikit-learn a PR with changes to code needs to meet a lazy consensus where two official reviewers (currently just core developers) explicitly approve, and no other official reviewer officially disapproves. Going a bit further up, a new feature request in a project could require the consensus of several core developers that are well-versed in the topic area. Large systemic changes manifest in the form of [SLEPs](https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep_template.html) (scikit-learn enhancement proposals) which require a ⅔ consensus across all core developers. Above even that, there are cross-community discussions where the idea of a “consensus” itself isn’t always really clear. 
+ + This system is a critical one, but there are important issues intrinsic to it that need to be addressed. For example, who gets to contribute to a consensus at each scale? What qualifications does one need, and how do we codify that? There’s also the intrinsic tradeoff where the stronger the consensus required, the less likely it is that changes will be adopted. This is by design since wide-reaching changes need to be held to high standards, but it does also mean that occasionally even for narrow-scoped problems no solution will be reached despite options being raised that are better than the status quo. + +1. __If we discuss how far OSS has evolved in 10 years, what would you like to see happen?__ + + I can’t speak to its evolution in the past 10 years, since I am still fairly new to OSS overall, but *I would like to see systematic data-driven analysis on contributors’ needs*. Different OSS projects have issued contributor surveys in the past, but in general I think a lot of emphasis is placed on the feedback given from users in meta issues or over community calls. While that is definitely helpful, there’s a lot of extrapolation that takes place when projects try to determine the needs of their contributor base like this. + + Some questions I would love to see studied include: + - What distribution does the expertise of the contributor base follow? + - What are the greatest bottlenecks at each level of expertise? + - Aside from expertise, are there other socio-economic or general demographics that exhibit consistent bottlenecks? (e.g. access to hardware) + - How do we create informed and effective DEI policies from this information? + + *OSS projects thrive and prosper based on their community, so I would love to see more systematic research on community needs and pain points.* + +1. 
__What are your favorite resources, books, courses, conferences, etc?__ + + I absolutely adore [“Probability and Statistics” by Evans and Rosenthal](https://www.utstat.toronto.edu/mikevans/jeffrosenthal/). It does a fantastic job of constructing a lot of otherwise daunting statistical concepts from very elementary ideas. It is my favorite book to recommend to eager students that do not have a rigorous foundation in probability and statistics, since this book does a great job of building up the reader’s intuition and making everything feel natural and derived, rather than arbitrarily defined. + + Regarding conferences, I have to go with [SciPy](https://conference.scipy.org/)! I was definitely scared going into the conference thinking that I would be the least-qualified person in every room and that I’d have nothing to talk about. I realized very quickly that there is _always_ something to talk about, and qualifications don’t matter. It’s a gathering of super passionate people that are each eager to talk about the things that interest them, so regardless of whether you’re an expert or a beginner, they will _happily_ explain things to you. Every single attendee has some area, no matter how specific, that they can talk about for hours. That genuine interest and excitement felt rejuvenating and reminded me why I love OSS so much. + +1. __What are your hobbies, outside of work and open source?__ + + I really enjoy hiking, camping and playing DnD (Dungeons & Dragons)! Camping especially is an important hobby for me since whenever I have a computer in reach I feel inclined to check my GitHub notifications, so the occasional total disconnect for a weekend is a fantastic tool for me to give myself a break with no pressure of “I _could_ work on that new feature right now…” + + If you have ever had difficulty with relaxing because of that little voice in your head that says “How dare you relax? 
You could be doing _this_ and _that_ right now!” then I highly recommend going camping, even just for one night! When that voice strikes during camping, I retort “Ah but you see, I don’t have my laptop, so I _can’t_ work on that right now. All I can do right now is relax.” and suddenly the anxiety washes away :) \ No newline at end of file diff --git a/_posts/2023-07-11-nvidia-is-a-new-sponsor.md b/_posts/2023-07-11-nvidia-is-a-new-sponsor.md new file mode 100644 index 0000000..4a619e7 --- /dev/null +++ b/_posts/2023-07-11-nvidia-is-a-new-sponsor.md @@ -0,0 +1,46 @@ +--- +title: "NVIDIA Is A New Sponsor Of The Scikit-Learn consortium at the Inria Foundation" +date: November 14, 2023 + +categories: + - Funding +tags: + - Sponsor + +featured-image: NVIDIAxsklearn.jpg + +postauthors: + - name: NVIDIA + website: https://developer.nvidia.com/gpu-accelerated-libraries + image: "nvidia-logo.png" + - name: François Goupil + email: francois.goupil@inria.fr + website: https://github.com/francoisgoupil + image: "francois_goupil.jpeg" +--- +
+ + {% include postauthor.html %} +
+ +*Sponsored blog post* + +We are thrilled to announce that [NVIDIA](https://www.nvidia.com) has joined the [scikit-learn consortium](https://scikit-learn.fondation-inria.fr/) as a corporate partner. As a leading provider of GPU-accelerated computing solutions, we at NVIDIA recognize the importance of machine learning and the role it plays in the growth of many industries and areas of science. Our partnership with the scikit-learn consortium demonstrates our commitment to supporting the development and advancement of open-source software in the machine learning community. + +
+ +
+ +[Scikit-learn](https://scikit-learn.org/stable/) is a popular open-source Python library for machine learning. One of the strengths of scikit-learn is its ease of use and well-defined API. This makes it a favorite tool among data scientists and machine learning practitioners. Thanks to its active community and continuous development, scikit-learn is constantly evolving and improving. + +At NVIDIA, we believe that investing in open-source projects like scikit-learn is important. Afterall, it is a central component of the modern data stack in both science and industry. By financially supporting the scikit-learn consortium, we are contributing to the long-term sustainability of scikit-learn and helping to ensure that it remains an easy to use, reliable and valuable tool for years to come. Furthermore, we hope to help advance the project's development, improve its performance, and enhance its capabilities for machine learning on GPUs. + +Our partnership with the scikit-learn consortium will also enable us to collaborate more closely with the scikit-learn community, and provide us with insights into how we can improve NVIDIA’s [RAPIDS open-source libraries](https://developer.nvidia.com/rapids) to better serve their needs. We are committed to working with the foundation to ensure that scikit-learn remains a powerful and easy to use machine learning library that meets the needs of data science practitioners in science and industry. + +NVIDIA’s commitment to scikit-learn goes beyond financial support. We have hired [Tim Head](https://betatim.github.io), an experienced open-source maintainer, to work full-time on the project. This is not Tim’s first open-source rodeo. He has previously contributed to several high-profile open-source projects, including Project Jupyter. His focus will be reviewing pull requests and coordinating the development of large features. Tim was recently elected as a core maintainer of scikit-learn. 
His expertise and experience will be invaluable in ensuring the continued growth and success of the project. + +In summary, NVIDIA’s partnership with the scikit-learn consortium is an important step in our ongoing commitment to support the development and growth of open-source software in the machine learning community. We are excited to work with the foundation and the community of contributors to help advance the capabilities of scikit-learn and accelerate the development of machine learning applications. + +AI helped write this blog post! diff --git a/_posts/2023-09-12-paris-dev-sprint.md b/_posts/2023-09-12-paris-dev-sprint.md new file mode 100644 index 0000000..7851dff --- /dev/null +++ b/_posts/2023-09-12-paris-dev-sprint.md @@ -0,0 +1,101 @@ +--- +title: "scikit-learn 2023 In-person Developer Sprint in Paris, France" +date: September 10, 2023 + +categories: + - Events +tags: + - Sprints + - Community +featured-image: 2023-paris-dev-sprint.png + +postauthors: + - name: Reshama Shaikh + website: https://reshamas.github.io + image: reshama_shaikh.jpeg + - name: François Goupil + email: francois.goupil@inria.fr + website: https://github.com/francoisgoupil + image: "francois_goupil.jpeg" +--- +
+ {% include postauthor.html %} +
+ + +During the week of June 19 to 23, 2023, the scikit-learn team held its first developers sprint since 2019! The sprint took place in Paris, France at the Dataiku office. The sprint event was an in-person event and had 32 participants. + +The following [scikit-learn team members](https://scikit-learn.org/stable/about.html) joined the sprint: + +1. Adrin Jalali +1. Arturo Amor Quiroz +1. François Goupil (@francoisgoupil) +1. Frank Charras (@fcharras) +1. Gael Varoquaux (@GaelVaroquaux) +1. Guillaume Lemaitre (@glemaitre) +1. Jérémie du Boisberranger (@jeremiedbb) +1. Joris Van den Bossche +1. Julien Jerphanion (@jjerphan) +1. Loïc Estève +1. Maren Westermann +1. Olivier Grisel (@ogrisel) +1. Roman Yurchak +1. Thomas Fan +1. Tim Head (@betatim) + +The following community members joined the sprint: + +1. Alexandre Landeau +1. Alexandre Vigny +1. Chaine San Buenaventura +1. Camille Troillard +1. Denis Engemann +1. Franck Charras +1. Harizo Rajaona +1. Ines (intern at Dataiku) +1. Jovan Stojanovic +1. Leo Dreyfus-Schmidt +1. Léo Grinsztajn +1. Lilian Boulard +1. Louis Fouquet +1. Riccardo Cappuzzo +1. Samuel Ronsin +1. Vincent Maladière +1. Yann Lechelle + + +
+ group of people who participated in the sprint +
+ scikit-learn Developer Sprint, Paris, June 2023; Photo credit: Copyright: Inria / Photo B. Fourrier, June 2023; (from left to right, back to front): +Last Row: Denis Engemann, Riccardo Cappuzzo, François Goupil, Tim Head, Guillaume Lemaitre, Louis Fouquet, Jérémie du Boisberranger, Frank Charras, Léo Grinsztajn, Arturo Amor Quiroz. +Middle Row: Thomas Fan, Lilian Boulard, Gaël Varoquaux, Ines, Jovan Stojanovic, Chaine San Buenaventura. +First Row: Olivier Grisel, Harizo Rajaona, Vincent Maladière. +
+
+ +## Sponsors +- Dataiku provided the space and some of the food, as well as all of the coffee. +- The scikit-learn consortium organized the sprint, paid for the lunch, the travel and accommodation expenses. + +## Topics covered at the sprint +- PR #13649: [Monotonic constraints for Tree-based models](https://github.com/scikit-learn/scikit-learn/pull/13649) +- Discussed the vision/future directions for the project. What is important to keep the project relevant in the future. +- Should we share some points beyond the vision statement? +- Thomas F will try and create a vision statement +- Discussed what people are keeping an eye on with a two year time scale in mind in terms of technology and developments that are relevant. +- Tim: keep improving our documentation (not just expanding it but also “gardening” to keep it readable) +- Tim: increase active outreach and communication about new features/improvements and other changes. A lot of cool things in scikit-learn are virtually unknown to the wider public (e.g. Hist grad boosting being on par with lightgbm in terms of performance, …) + + +### What is next? + +We are discussing co-locating with OpenML in 2024 in Berlin, Germany to organize another developers' sprint. + + +
+ group of people who participated in the sprint +
+ scikit-learn Developer Sprint, Paris, June 2023; Photo credit: Copyright Inria / Photo B. Fourrier, June 2023; (from left to right): Thomas Fan, Olivier Grisel +
+
diff --git a/_posts/2023-27-11-mentoring.md b/_posts/2023-27-11-mentoring.md new file mode 100644 index 0000000..008f903 --- /dev/null +++ b/_posts/2023-27-11-mentoring.md @@ -0,0 +1,57 @@ +--- +title: "My mentored internship at scikit-learn" +date: November 27, 2023 + +categories: + - Diversity +tags: + - Internship + - Diversity + - Inclusiveness + + +postauthors: + - name: Stefanie Senger + email: stefanie.senger@posteo.de + website: https://github.com/StefanieSenger + image: "stefanie-senger.jpeg" + + - name: François Goupil + email: francois.goupil@inria.fr + website: https://github.com/francoisgoupil + image: "francois_goupil.jpeg" +--- +
+ + {% include postauthor.html %} +
+ +## How it is to be an Intern at scikit-learn + +My name is Stefanie Senger, and I recently concluded a five-month mentored internship at scikit-learn, that had been funded by NumFocus as a Small Development Grant with a clear focus on fostering diversity in open-source projects. The idea to couple a grant with mentorship traces back to Maren Westermann's initiative. She envisioned a pathway to integrate more female coders into scikit-learn through internships and support. Scikit-learn would profit from fresh perspectives and some disruption. I was the guinea pig for an initial experiment, as Maren later told me. + + +## Starting the Internship + +As someone transitioning from a non-technical background to coding, working on scikit-learn was a big thing for me. I had participated in and taught at a data science boot camp, searching diligently for a first role in the field. I never doubted I could tackle more difficult tech challenges over time, but I knew there was much to learn. Scikit-learn had a heavy-tech aura to me, and when I discovered the internship ad, I just thought: this. I was genuinely taken aback when accepted for the role, though. There are many more experienced people looking for such an opportunity, after all. + +When I got to know better both my mentors, Adrin Jalali and Guillaume Lemaitre, it became quickly clear that only effort was required, and I could ask them any question along the way. I felt very welcome in the community, also by the other people I interacted with on GitHub. + + +## What I Worked on + +I began by working on documentation and examples such as "Multi-class AdaBoosted Decision Trees," to make those more comprehensive and helpful for users. Then some maintenance tasks on the code that were repetitive so I could find out what to do from other contributors' pull requests. Guillaume discovered that one AdaBoost algorithm required deprecation, and it fell on me to execute this. 
I had never looked at such a huge code base with so many layers of abstraction, and I had to learn quite some more Python to be able to go ahead. I even got the opportunity to present an "Intro to scikit-learn" workshop at EuroSciPy, the European conference on the scientific use of Python in Basel, where I also got to know many other contributors and people from the scikit-learn team at Inria. + +Adrin introduced me to the challenging task of implementing a new feature for metadata routing, developed over many years by the scikit-learn community. It allows users to set metadata, such as sample weights, in meta estimators, that can be routed to sub-estimators and other algorithms that are able to consume it. This was partly uncharted territory and meant finding solutions where there was no predefined path and adapting tests to match the expected behavior. In the last two months of my internship, I implemented metadata routing into some meta-estimators, which was tremendously difficult but, once accomplished, has nourished my professional confidence since. + + +## Mentorship in Action + +Let me describe how the mentoring worked because Guillaume's and Adrin's support was invaluable. They would both literally drop their tasks when I had questions and right away hint me in the right direction. I met Adrin twice a week, and we would co-work while I would throw questions at him. Guillaume was available remotely, and I knew he would jump into a video call with me when I needed help. They both gave reviewing my PRs a priority, and I got feedback on my work regularly. + +It was essential to have mentors signaling that it's okay to be learning and to propose tasks to me. If I had come into the project individually, I might have hesitated to take on most of the issues I ended up working on, fearing that my skills were insufficient and that I would hinder the progress of the project rather than help it. 
The mentoring setting gave me a justification to try things that I wasn't sure if I could do. + + +## Becoming a Community Member + +Looking ahead, I will continue contributing to scikit-learn. As I've gotten to know quite a few of the other contributors in person, I now feel part of the community. I know they care about values like openness and diversity, that I share, and while acknowledging the complexity of the code base, I know what I can learn from taking on issues and the sense of accomplishment when merging my solution into the main branch. And I love contributing to something meaningful, which is something I had always sought. diff --git a/_posts/2024-05-04-authorship-info.md b/_posts/2024-05-04-authorship-info.md new file mode 100644 index 0000000..cd29c1d --- /dev/null +++ b/_posts/2024-05-04-authorship-info.md @@ -0,0 +1,60 @@ +--- +#### Blog Post Template #### + +#### Post Information #### +title: "Note on Inline Authorship Information in scikit-learn" +date: May 4, 2024 + +#### Post Category and Tags #### +# Format in titlecase without dashes (Ex. "Open Source" instead of "open-source") +categories: + - Updates +tags: + - Open Source + - Machine Learning + - License + +#### Featured Image #### +featured-image: BSD_watermark.svg + +#### Author Info #### +# Can accomodate multiple authors +# Add SQUARE Author Image to /assets/images/author_images/ folder +postauthors: + - name: Adrin Jalali + website: https://adrin.info/ + image: adrin-jalali.jpeg +--- +
+ + {% include postauthor.html %} +
+ +Historically, scikit-learn's files have included authorship information similar +to the following format: + +```python +# Authors: Author1, Author2, ... +# License: BSD 3 clause +``` + +However, after a series of discussions which you can see in detail in [this +issue]( https://github.com/scikit-learn/scikit-learn/pull/28799), we could list +the following caveats to the status quo: + +- Authorship information was not up-to-date and in most cases, but not always, + reflect the original authors of the file; +- It was unfair to all other contributors who have been contributing to the + code-base; +- One can check the real authors and the history of the authors of any part of + the code-base using `git blame` and other `git` tools. + +Therefore we came to the conclusion to standardize all authorship information to +mention "The scikit-learn developers", and have the license notice as: + +```python +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause +``` + +The change is to happen gradually in the coming months after April 2024. diff --git a/_posts/2024-07-18-yao-interview.md b/_posts/2024-07-18-yao-interview.md new file mode 100644 index 0000000..e1fa60b --- /dev/null +++ b/_posts/2024-07-18-yao-interview.md @@ -0,0 +1,64 @@ +--- +title: "Interview with Yao Xiao, scikit-learn Team Member" +date: July 18, 2024 +categories: + - Team +tags: + - Open Source +featured-image: + +postauthors: + - name: Reshama Shaikh + website: https://reshamas.github.io + image: reshama_shaikh.jpeg + - name: Yao Xiao + website: https://charlie-xiao.github.io/ + image: yao-xiao.jpeg +--- + +
+ + {% include postauthor.html %} +
+ +Yao Xiao recently earned his undergraduate degree in mathematics and computer science. He will be pursuing a Master’s degree in Computational Science and Engineering at Harvard SEAS. Yao joined the scikit-learn team in February 2024. + +1. __Tell us about yourself.__ + + My name is Yao Xiao and I live in Shanghai, China. At the time of interview I have just got my Bachelor’s degree in Honors Mathematics and Computer Science at NYU Shanghai, and I’m going to pursue a Master’s degree in Computational Science and Engineering at Harvard SEAS. My current research interests are in networks and systems (e.g. sys4ml and ml4sys), but this may change in the future. + + - GitHub: [@Charlie](https://github.com/Charlie-XIAO) + - LinkedIn: [@yao-xiao](https://www.linkedin.com/in/yao-xiao-200073244/) + - Website: [https://charlie-xiao.github.io](https://charlie-xiao.github.io/) + +1. __How did you first become involved in open source and scikit-learn?__ + + In my junior year I took a course at NYU Courant called Open Source Software Development where we needed to make contributions to an open source software as our final project - and I chose scikit-learn. + +1. __We would love to learn of your open source journey.__ + + I was lucky to get involved in a pretty easy meta-issue when I first started contributing to scikit-learn. I made quite a few PRs towards that issue, familiarizing myself with the coding standards, contributing workflow etc., and during which I gradually explored the codebase and learned a lot from maintainers how to write better code. After that meta-issue was completed, I decided to continue contributing since I enjoyed the experience, and I started looking through the open issues, tried reproducing and investigating them, then opened PRs for those that I was able to solve. It is the process of familiarizing with more parts of the codebase, being able to make more PRs, so on and so forth. 
While contributing to scikit-learn, sometimes there are also issues to solve upstream, so I also had opportunities to contribute to projects like pandas and pydata-sphinx-theme. Up till today I’m still far from familiar with the entire scikit-learn project, but I will definitely continue the amazing open-source journey. + +1. __To which OSS projects and communities do you contribute?__ + + I have contributed to scikit-learn, pandas, pydata-sphinx-theme, sphinx-gallery. I’m also writing some small softwares that I decide to make open source. + +1. __What do you find alluring about OSS?__ + + It is amazing to feel that my code is being used by so many people all around the world through contributing to open source projects. Well it might be inappropriate to say “my code”, but I do feel like making some actual contributions to the community instead of just writing code for myself. Also OSS makes me care about code quality and so on instead of merely making things “work”, which is very important for programmers but not really taught in school. + +1. __What pain points do you observe in community-led OSS?__ + + Collaboration can lead to better code but also slows down the development process. Especially when there are not enough reviewers around, issues and PRs can easily get stale or forgotten. But I would say it’s more like a tradeoff rather than a pain point. + +1. __If we discuss how far OS has evolved in 10 years, what would you like to see happen?__ + + I couldn’t say about the past 10 years since I’ve only been involved for about one and a half years, but regarding the scientific Python ecosystem I would like to see better coordination across projects (which is already happening). For instance a common interface for array libraries and dataframe libraries would allow downstream dependents to easily provide more flexible support for different input/output types, etc. And as a Chinese I would also hope that open source can thrive in my country some day as well. 
+ +1. __What are your favorite resources, books, courses, conferences, etc?__ + + As for physical books I would recommend *The Pragmatic Programmer* by Andy Hunt and Dave Thomas, and *Refactoring: Improving the Design of Existing Code* by Martin Fowler and Kent Beck. As for courses I like MIT’s *The Missing Semester of Your CS Education*. In particular about learning Python, *The Python Tutorial* in the official Python documentation is good enough for me. By the way I want to mention that the **documentation** of most languages and popular packages is very nice and it is the best place to learn the most up-to-date information. + +1. __What are your hobbies, outside of work and open source?__ + + I would say my largest hobby is programming (not for school, not for work, just for fun). I’ve recently been fascinated with [Tauri](https://v2.tauri.app/) and wrote a lot of small desktop applications for myself in my spare time. Apart from this I also love playing the piano and I’m an anime lover, so I often listen to or play piano versions of anime theme songs (mostly arranged by [Animenz](https://www.animenzpiano.com/)). diff --git a/_posts/2024-07-24-adam-li-interview.md b/_posts/2024-07-24-adam-li-interview.md new file mode 100644 index 0000000..c8d807f --- /dev/null +++ b/_posts/2024-07-24-adam-li-interview.md @@ -0,0 +1,87 @@ +--- +title: "Interview with Adam Li, scikit-learn Team Member" +date: July 24, 2024 +categories: + - Team +tags: + - Open Source +featured-image: /assets/images/posts_images/adam-li-interview.png + +postauthors: + - name: Reshama Shaikh + website: https://reshamas.github.io + image: reshama_shaikh.jpeg + - name: Adam Li + website: https://adam2392.github.io/ + image: adam-li.jpeg +--- + 
+ + {% include postauthor.html %} +
+ + +BIO: Adam is currently a Postdoctoral Research Scientist at Columbia University in the Causal Artificial Intelligence Lab, directed by [Dr. Elias Bareinboim](https://causalai.net/). He is an [NSF-funded Computing Innovation Research Fellow](https://cifellows2021.org/2021-class/). He did his PhD in biomedical engineering, specializing in computational neuroscience and machine learning at Johns Hopkins University working with Dr. Sridevi V. Sarma in the [Neuromedical Control Systems group](https://sarmalab.icm.jhu.edu/). He also jointly obtained a MS in Applied Mathematics and Statistics with a focus in statistical learning theory, optimization and matrix analysis. He was fortunate to be a [NSF-GRFP fellow, Whitaker International Fellow](https://icm.jhu.edu/2017/03/20/adam-li-selected-for-nsf-graduate-research-and-whitaker-international-fellowships/#.YH2ZT6lKj0o), [Chateaubriand Fellow](https://icm.jhu.edu/2017/06/16/adam-li-icm-phd-student-selected-for-chateaubriand-fellowship/#.YH2Zi6lKj0o) and [ARCS Chapter Scholar](https://icm.jhu.edu/2020/07/20/adam-li-icm-phd-student-receives-arcs-scholarship/#.YH2ZbKlKj0o) during his time at JHU. Adam officially joined the scikit-learn team as a maintainer in July 2024. 
+ +- GitHub: [@adam2392](https://github.com/adam2392) +- LinkedIn: [@adam2392](https://www.linkedin.com/in/adam2392/) +- Website: [https://adam2392.github.io](https://adam2392.github.io/) + +Link to scikit-learn contributions (issues, pull requests): +- [FEA Add missing-value support for ExtaTreeClassifier and ExtaTreeRegressor](https://github.com/scikit-learn/scikit-learn/pull/27966) +- [DOC Fix tree explanation of tree_.value in example](https://github.com/scikit-learn/scikit-learn/pull/29331) +- [ENH Enable prediction of isolation forest in parallel](https://github.com/scikit-learn/scikit-learn/pull/28622) +- [ENH Adding estimators_samples_ attribute to forest models](https://github.com/scikit-learn/scikit-learn/pull/26736) +- [FEA SLEP006: Metadata routing for SelfTrainingClassifier](https://github.com/scikit-learn/scikit-learn/pull/28494) +- [FEAT SLEP006 permutation_test_score to support metadata routing](https://github.com/scikit-learn/scikit-learn/pull/29266) +- [FEA Categorical split support for DecisionTree*, ExtraTree*, RandomForest* and `ExtraTrees* #29437](https://github.com/scikit-learn/scikit-learn/pull/29437) +- Issue: [Adding Oblique Trees (Forest-RC) to the Cythonized Tree Module](https://github.com/scikit-learn/scikit-learn/issues/20819) + +1. __Tell us about yourself.__ + + I currently live in New York City, where I work on theoretical and applied AI research through the lens of causal inference, statistical modeling, dynamical systems and signal processing. My current research is focused on telling a causal story, specifically in the case one has multiple distributions of data from the same causal system. For example, one may have access to brain recordings from monkeys and humans. Given these heterogeneous datasets, I am interested in answering: what causal relationships can we learn. This is known as the causal discovery problem, where given data, one attempts to learn what causes what. 
Another problem that I work on that is highly relevant to generative AI is the problem of causal representation learning. Here, I develop theory and train deep neural networks to understand causality among latent factors. Specifically, we demonstrate how to leverage multiple datasets and a causal neural network to generate data that is causally realistic. This can enable more robust data generation from general latent variable models. + + +1. __How did you first become involved in open source and scikit-learn?__ + + I first got involved in open source as a user. I was making the switch from Matlab to Python and started using packages like numpy and scipy pretty regularly. In my PhD research, I dealt with a lot of electrophysiological data (i.e. EEG brain recordings). I was writing hundreds of lines of code to load and preprocess data, and it was always changing based on different constraints. That was when I discovered [MNE-BIDS](https://github.com/mne-tools/mne-bids), a Python package within the MNE framework for reading and writing brain recording data in a structured format. This changed my life because now my preprocessing and data loading code was a few lines of code that adhered to an open standard tested by thousands of researchers. I realized the value of open source, and began contributing in my spare time. + + +1. __We would love to learn of your open source journey.__ + + I first started contributing to open-source in the [MNE](https://github.com/mne-tools) organization. This package implements data structures for the processing and analysis of neural recording data (e.g. MEG, EEG, iEEG data). I contributed over 70 pull requests in the MNE-BIDS package, and subsequently was invited to be a maintainer for [MNE-BIDS](https://github.com/mne-tools/mne-bids) and [MNE-Python](https://github.com/mne-tools/mne-python). 
Later on, I participated in a Google Summer of Code to port the connectivity submodule within MNE-Python to a new package, known as [MNE-Connectivity](https://github.com/mne-tools/mne-connectivity). I added new data structures, and algorithms for the sake of improving the feature developments for connectivity algorithms among neural recording data. Later on, I also worked with a team on porting a neural network architecture from Matlab to the MNE framework to automatically classify ICA derived components. This became known as [MNE-ICALabel](https://github.com/mne-tools/mne-icalabel). These experiences gave me the experience necessary to work in a large asynchronous team environment that is common in OSS. It also taught me how to begin contributing to an OSS project. This led me to scikit-learn. + + I first got involved in scikit-learn as a user, who was heavily interested in the decision tree model in scikit-learn (random forest, randomized trees). Here, I was interested in contributing a [new oblique decision tree model](https://github.com/scikit-learn/scikit-learn/issues/20819) that was a generalization of the existing random forest model. However, the code was not easily added to scikit-learn, and currently the decision to include it is inconclusive. Throughout this process, I learned about the challenges and intricacies of maintaining such a large OSS project as scikit-learn. It is not trivial to simply add new features to a large OSS project because code comes with a maintenance cost, and should fit with the current internal design. At this point in time, there were very few maintainers that were able to maintain the tree submodule, and as such new features are included conservatively. + + I was eager to improve the project to enable more exciting features for the community, so I began contributing to scikit-learn starting with smaller issues such as documentation improvements, or minor bug fixes to get acquainted with the codebase. 
I also refactored various Cython code to begin upgrading the codebase, especially in the tree submodule. Throughout this process, I identified other projects the maintainers team were working on, and also contributed there. For example, I added metadata routing to a variety of different functions and estimators in scikit-learn. I also began reviewing PRs for the tree submodule and metadata routing where I had knowledge. I also added missing-value support for extremely randomized tree models (called ExtraTrees in scikit-learn). This allows users to pass in data that contains missing values (encoded as `np.nan`) to ExtraTrees. Around this time, I was invited to join the maintainer team of scikit-learn. More recently, I have taken on the project to add [categorical data support](https://github.com/scikit-learn/scikit-learn/pull/29437) to the decision tree models, which will make random forests and extremely randomized tree models more performant and capable to handle real world settings where there is commonly categorical data. + + +1. __To which OSS projects and communities do you contribute?__ + + I currently primarily contribute to scikit-learn, [PyWhy](https://github.com/py-why/dodiscover) (a community for causal inference in Python), and also develop my own OSS project: [treeple](https://github.com/neurodata/treeple). Treeple is an exciting package that implements different decision tree models beyond those offered in scikit-learn with an efficient Cython implementation stemming from the scikit-learn tree internals. + + +1. __What do you find alluring about OSS?__ + + OSS is so exciting because of the impact it has. Everyone from private projects to other OSS projects will use OSS. Any fixes to documentation, performance improvements, or new features will potentially impact the workflows of potentially millions of people. This is what makes contributing to OSS so exciting. 
Moreover, this impact ensures that best practices are usually carried out in these projects, and it’s a great playground to learn from the best, while giving back to the larger community. + + +1. __What pain points do you observe in community-led OSS?__ + + Right now, community-led OSS moves very slowly in most places. This is for a number of very good reasons: i) not releasing buggy features that may impact millions of people, and ii) backwards compatibility. One of the challenges of maintaining a high-quality OSS project is that you would like to satisfy your users, who may all utilize different components of the project from different versions. As such, many community-led OSS projects take a conservative approach when implementing new features and new ideas. However, there may be many exciting better features that are already known by the community, but still lack an OSS implementation. + + I think this can be partially solved by increased funding for OSS, so OSS maintainers and developers are able to dedicate more time to maintaining and improving the projects. In addition, I think this can be improved if more developers in the community contribute to said OSS projects. I hope that I have convinced you though that contributing to OSS is impactful and highly educational. + + +1. __If we discuss how far OSS has evolved in 10 years, what would you like to see happen?__ + + I think more interoperability and integrated workflows for projects will make projects that utilize OSS more streamlined and efficient. For example, right now there are different array libraries (e.g. numpy, cupy, xarray, pytorch, etc.), which all support some manner of an n-dimensional array, but with a slightly different API. This makes it very painful to transition across different libraries that use different arrays. In addition, there are multiple dataframe libraries, such as pandas and polars, and this problem of API consistency also arises there. 
+ + Some work has been made on the Array-API front to allow different array libraries to serve as backends given a common API. This will enable GPU acceleration for free without a single code change, which is great! This will be exciting because users will eventually only have to write code in a single way, and can then leverage any array/dataframe library that has different advantages and disadvantages based on the user use case. + + +1. __What are your hobbies, outside of work and open source?__ + + I enjoy running, trying new restaurants and bars, cooking and reading. I’m currently training for a half-marathon, where my goal is to run under 8 minutes per mile. I’m also trying to perfect a salad with an asian-themed dressing. In a past life, I was a bboy (breakdancer) for ten years until I stopped in graduate school because I got busy (and old). + diff --git a/_posts/2024-08-06-czi-eoss6-announcement.md b/_posts/2024-08-06-czi-eoss6-announcement.md new file mode 100644 index 0000000..0dacc86 --- /dev/null +++ b/_posts/2024-08-06-czi-eoss6-announcement.md @@ -0,0 +1,110 @@ +--- +title: "Chan Zuckerberg Initiative considers scikit-learn an Essential Open Source Software" +date: August 6, 2024 +categories: + - Funding +tags: + - Open Source + - Funding + - Internship + - Diversity +featured-image: sklearn_czi.png + +postauthors: + - name: Guillaume Lemaitre + website: https://github.com/glemaitre + image: guillaume-lemaitre.jpg + - name: Lucy Liu + website: https://github.com/lucyleeow + image: lucyliu.jpeg +--- +
+ + {% include postauthor.html %} +
+ +We are delighted to announce that `scikit-learn` has been awarded a grant from +the [Chan Zuckerberg Initiative (CZI)](https://chanzuckerberg.com/)'s [Essential Open +Source Software for Science +(EOSS)](https://chanzuckerberg.com/rfa/essential-open-source-software-for-science/) +program. This grant is funded by [Wellcome Trust](https://wellcome.org/). +As in previous rounds, this cycle supports open-source software projects that are +essential to biomedical research. This is the third time that CZI EOSS supports +`scikit-learn`. + +In this new grant, we will focus on improving the [evaluation and inspection of +predictive +models](https://chanzuckerberg.com/eoss/proposals/predictive-models-evaluation-inspection-in-scikit-learn/). + +## Predictive models evaluation & inspection + +When building a machine learning pipeline for a specific research problem, two key +aspects are closely connected: (i) design of the pipeline and (ii) assessment, analysis, and +inspection of it. Researchers strive to identify the optimal pipeline, maximizing specific +evaluation metrics, while also seeking to explain the validity and rationale behind +the pipeline's predictions. This is the cornerstone of answering research +questions. With this proposal we aim to improve and extend the available `scikit-learn` +tools. + +`scikit-learn` provides building blocks for model evaluation and statistical analysis of +results. Originally, this information was presented in a raw format and required +expertise from scientists to create intuitive reports for outreach to peers and +outsiders. Recently, the `scikit-learn` community developed displays to easily generate +visual figures for communicating such results. However, these displays are still in +their early development stages and do not leverage all available statistical analysis +tools (i.e., cross-validation) from `scikit-learn`. 
Thus, we aim to expand these +displays, using the right statistical tools and thus promote the adoption of best +practices when reporting results. Additionally, we also intend to create new displays +to support common analysis tasks that are not yet covered in `scikit-learn`. + +In the domain of model inspection, we aim to address several areas: (i) model inspection +during training, (ii) enhancing user experience through interactive inspection, and +(iii) model explainability. First, during the training of a pipeline, researchers are +interested in monitoring the internal characteristics of the model, which is a not yet +addressed long-standing issue in `scikit-learn`. We want to build upon some initial work +by implementing a "callback" framework that allows users to track these internal +parameters. Next, researchers commonly use interactive tools such as Jupyter Notebook to +develop pipelines. `scikit-learn` started some efforts to visually and interactively +display pipelines in these environments. However, there is room for improvement in terms +of user interaction and accessibility. Finally, as `scikit-learn` is widely used as a +reference package, it is crucial to improve the section of the library dedicated to +model explainability. We aim to improve the documentation and user experience with the +existing explainability tools, making sure that they use the appropriate tool for their +use cases. In addition, we propose to work on a scikit-learn enhancement proposal (SLEP) +to define a common API for model explainability within scikit-learn. Ultimately, the +goal is to come to a consensus to provide scikit-learn end-users with a consistent +experience when using model explainability tools. + +On top of all these items, we intend to continue working on the general maintenance of +the project, addressing bug reports and performance regressions. As a community-driven +project, we also want to dedicate time reviewing external contributions. 
+ +## Involved people + +To execute this project, we plan the following hires: + +- [Lucy Liu](https://github.com/lucyleeow) (Quansight Labs) will work about half-time on + the project, on topics related to displays and feature importance. +- We will hire full-time interns to work on the other part of the project. The + initial plan is to hire two interns for a period of 6 months each and repeat this + process for the next 2 years. We want to provide opportunities to underrepresented + groups in the field of machine learning and data science, similarly to previous + initiatives (cf. [NumFOCUS Small Development + Grant](https://blog.scikit-learn.org/diversity/mentoring/)). + +## Past CZI EOSS grants + +In the past `scikit-learn` has been awarded two grants from the CZI EOSS program: + +- [CZI EOSS Cycle 1](https://chanzuckerberg.com/eoss/proposals/scikit-learn-maintenance-and-enhancement-for-gradient-boosting/) + helped create the + [`HistGradientBoostingClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html) and + [`HistGradientBoostingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html) estimators. + These estimators are the equivalent of gradient boosting models implemented in + `LightGBM` and `XGBoost`. +- [CZI EOSS Cycle 4](https://chanzuckerberg.com/eoss/proposals/maintenance-extension-of-scikit-learn-machine-learning-in-python/) + extended `scikit-learn` to work better with missing values and categorical data in + several estimators. + +Both grants allowed us to maintain and enhance `scikit-learn` to better serve the +community. 
diff --git a/_posts/2024-09-02-survey-announcement.md b/_posts/2024-09-02-survey-announcement.md new file mode 100644 index 0000000..c945577 --- /dev/null +++ b/_posts/2024-09-02-survey-announcement.md @@ -0,0 +1,55 @@ +--- +title: "Announcing the launch of the scikit-learn user survey" +date: September 2, 2024 + +categories: + - Updates +tags: + - Community + - Open Source + +postauthors: + - name: Inessa Pawson + email: inessapawson@gmail.com + website: https://github.com/inessapawson + image: "inessa-pawson.jpg" + - name: François Goupil + email: francois.goupil@inria.fr + website: https://github.com/francoisgoupil + image: "francois_goupil.jpeg" +--- +
+ + {% include postauthor.html %} +
+ +We are excited to announce the launch of the scikit-learn user survey! Scikit-learn +continues to evolve thanks to contributions from its diverse user community. As we plan +for future releases, we want to ensure we are focusing on what matters most to you — our +users. + +The goal of this survey is to better understand how users interact with the library, +identify any pain points, learn about the features you find most useful, and what’s +missing. This is your chance to have a say in how the library grows and adapts to meet +the evolving needs of the machine learning community. + +The survey will take about 15 minutes of your time. It is available in Arabic, French, +English, Japanese, Mandarin, Spanish, and Portuguese. You have the option to remain +completely anonymous, and the data collected will be used solely for the purpose of +improving scikit-learn. + +This user survey is a truly collaborative effort. We would like to thank the teams from +probabl, University of Oxford (UK), and POSSEE OpenTeams, as well as many scikit-learn +contributors, for their time and effort in designing and translating it. + +Once the survey closes, we’ll analyze the responses and publish the findings in a +follow-up blog post. + +To take the survey, visit: +[https://forms.gle/p5P7AweCJCbFMzfo6](https://forms.gle/p5P7AweCJCbFMzfo6). +The survey will remain open until October 14th, 2024, and we encourage you to share it with your +colleagues and extended network. + +We value every contribution in our community, and we’re committed to making scikit-learn +even better. Your feedback is the foundation upon which scikit-learn will continue to +grow and evolve. We look forward to hearing from you! 
diff --git a/_posts/2024-12-05-dev-api.md b/_posts/2024-12-05-dev-api.md new file mode 100644 index 0000000..0ebf4e2 --- /dev/null +++ b/_posts/2024-12-05-dev-api.md @@ -0,0 +1,141 @@ +--- +#### Blog Post Template #### + +#### Post Information #### +title: "Changes and development of scikit-learn's developer API" +date: December 12, 2024 + +#### Post Category and Tags #### +# Format in titlecase without dashes (Ex. "Open Source" instead of "open-source") +categories: + - Updates +tags: + - Open Source + - Machine Learning + - License + +#### Featured Image #### +featured-image: BSD_watermark.svg + +#### Author Info #### +# Can accomodate multiple authors +# Add SQUARE Author Image to /assets/images/author_images/ folder +postauthors: + - name: Adrin Jalali + website: https://adrin.info/ + image: adrin-jalali.jpeg +--- +
+ + {% include postauthor.html %} +
+ +Historically, scikit-learn's API has been divided into public and private. Public API is +intended to be used by users, and private API is used internally in scikit-learn to +develop new features and estimators. However, many of those functionalities have become +essential to develop scikit-learn estimators by third parties who develop them outside +the scikit-learn codebase. + +When it comes to our public API, we have very strict and high standards on backward +compatibility. The rule of thumb is that no change should cause a change in users' +code unless we warn about it for two release cycles, which means we give users a year's +time to update their code. + +On the other hand, we have no such guarantees or constraints on our private API. This +brings an issue to third party developers who would like to use methods used by +scikit-learn developers to develop their estimators. Constantly changing private API +without prior warning brings certain challenges to third party developers which is not +ideal. + +As a result, we've been working on creating a developer API which would sit somewhere +between our public and private API in terms of backward compatibility. That means we +intend to try to keep that API stable, and if needed, introduce changes with one release +cycle warning. + +In the past few releases, we've slowly introduced more functionalities under this +umbrella. `__sklearn_clone__` and `__sklearn_is_fitted__` are two examples. + +In the 1.6 release, we focused on the testing infrastructure and estimator tag system. +Estimator tags used to be private, and we were not sure about their design. In the 1.6 +release, new tags are introduced and using them looks like the following: + +```python +from sklearn.base import BaseEstimator, ClassifierMixin + +class MyEstimator(ClassifierMixin, BaseEstimator): + + ... 
+ + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # modify tags here + tags.non_deterministic = True + return tags +``` + +The new tags mostly follow the same structure as the old tags, but there are certain +changes to them. The main change is that the old `_xfail_checks` is no longer present +in the new tags. That tag was used to tell the common testing tools about the tests +which are known to fail and are to be skipped. That information is now directly passed +to the test functionalities. The old way of skipping a test was the following: + +```python +from sklearn.base import BaseEstimator, ClassifierMixin + +class MyEstimator(ClassifierMixin, BaseEstimator): + + ... + + def _more_tags(self): + return { + "_xfail_checks": { + "check_to_skip_name": "this check is known to fail", + ... + } + } +``` + +And then when calling `check_estimator` or using `parametrize_with_checks` with `pytest` +would automatically ignore those tests for the estimator. + +Instead, in this release, you pass that information directly to those methods: + +```python +from sklearn.utils.estimator_checks import check_estimator, parametrize_with_checks + +CHECKS_EXPECTED_TO_FAIL = { + "check_to_skip_name": "this check is known to fail", + ... +} + +# Using check_estimator +def test_with_check_estimator(): + check_estimator(MyEstimator(), expected_failed_checks=CHECKS_EXPECTED_TO_FAIL) + +# Using parametrize_with_checks +@parametrize_with_checks( + [MyEstimator()], + expected_failed_checks=lambda est: CHECKS_EXPECTED_TO_FAIL +) +def test_with_parametrize_with_checks(estimator, check): + check(estimator) +``` + +While working on the testing infrastructure, we have also been working on improving our +tests and that means in this release we had a particularly high number of changes in +their names and what they do. The changes will make it easier for developers to fix +issues with their estimators. 
Note that you can now pass `legacy=False` to both +`check_estimator` and `parametrize_with_checks` to include only strictly API related +tests. + +The above changes mean developers need to update their estimators and depending on +what they use, write scikit-learn version specific code to handle supporting multiple +scikit-learn versions. To make that process easier, we've worked on a package called +[`sklearn_compat`](https://github.com/sklearn-compat/sklearn-compat/). You can either +depend on it as a package dependency, or vendor a single file inside your project. At +the moment this project is in its infancy and might change in the future. But hopefully +it helps developers out there. + +If you think there are missing functionalities in the developer API, please let us know +and give us feedback on our [issue tracker]( +https://github.com/scikit-learn/scikit-learn/issues). diff --git a/_sass/minimal-mistakes/_buttons.scss b/_sass/minimal-mistakes/_buttons.scss index 9ef60a8..fe57efb 100644 --- a/_sass/minimal-mistakes/_buttons.scss +++ b/_sass/minimal-mistakes/_buttons.scss @@ -39,7 +39,10 @@ (info, $info-color), (facebook, $facebook-color), (twitter, $twitter-color), - (linkedin, $linkedin-color); + (linkedin, $linkedin-color), + (bluesky, $bluesky-color), + (mastodon, $mastodon-color); + @each $buttoncolor, $color in $buttoncolors { &--#{$buttoncolor} { diff --git a/_sass/minimal-mistakes/_utilities.scss b/_sass/minimal-mistakes/_utilities.scss index 0200774..57ffd4b 100644 --- a/_sass/minimal-mistakes/_utilities.scss +++ b/_sass/minimal-mistakes/_utilities.scss @@ -307,6 +307,16 @@ body:hover .visually-hidden button { color: $twitter-color; } + .fa-bluesky, + .fa-bluesky-square { + color: $bluesky-color; + } + + .fa-mastodon, + .fa-mastodon-square { + color: $mastodon-color; + } + .fa-vimeo, .fa-vimeo-square, .fa-vimeo-v { diff --git a/_sass/minimal-mistakes/_variables.scss b/_sass/minimal-mistakes/_variables.scss index 3f60b96..03488fc 100644 --- 
a/_sass/minimal-mistakes/_variables.scss +++ b/_sass/minimal-mistakes/_variables.scss @@ -97,6 +97,7 @@ $yiq-debug: false !default; /* brands */ $behance-color: #1769ff !default; $bitbucket-color: #205081 !default; +$bluesky-color: #448dee !default; $dribbble-color: #ea4c89 !default; $facebook-color: #3b5998 !default; $flickr-color: #ff0084 !default; diff --git a/assets/.DS_Store b/assets/.DS_Store deleted file mode 100644 index 3128289..0000000 Binary files a/assets/.DS_Store and /dev/null differ diff --git a/assets/css/main.scss b/assets/css/main.scss index 1543678..040bdbf 100644 --- a/assets/css/main.scss +++ b/assets/css/main.scss @@ -1,6 +1,6 @@ ---- -# Only the main Sass file needs front matter (the dashes are enough) ---- +// --- +// # Only the main Sass file needs front matter (the dashes are enough) +// --- @charset "utf-8"; diff --git a/assets/images/.DS_Store b/assets/images/.DS_Store deleted file mode 100644 index 6e084ab..0000000 Binary files a/assets/images/.DS_Store and /dev/null differ diff --git a/assets/images/author_images/adam-li.jpeg b/assets/images/author_images/adam-li.jpeg new file mode 100644 index 0000000..e3c5dcd Binary files /dev/null and b/assets/images/author_images/adam-li.jpeg differ diff --git a/assets/images/author_images/guillaume-lemaitre.jpg b/assets/images/author_images/guillaume-lemaitre.jpg new file mode 100644 index 0000000..f2915fe Binary files /dev/null and b/assets/images/author_images/guillaume-lemaitre.jpg differ diff --git a/assets/images/author_images/inessa-pawson.jpg b/assets/images/author_images/inessa-pawson.jpg new file mode 100644 index 0000000..a36db17 Binary files /dev/null and b/assets/images/author_images/inessa-pawson.jpg differ diff --git a/assets/images/author_images/meekail-zain.jpg b/assets/images/author_images/meekail-zain.jpg new file mode 100644 index 0000000..06e92ff Binary files /dev/null and b/assets/images/author_images/meekail-zain.jpg differ diff --git 
a/assets/images/author_images/nvidia-logo.png b/assets/images/author_images/nvidia-logo.png new file mode 100644 index 0000000..4f33994 Binary files /dev/null and b/assets/images/author_images/nvidia-logo.png differ diff --git a/assets/images/author_images/sangam_swadik.jpg b/assets/images/author_images/sangam_swadik.jpg new file mode 100644 index 0000000..c28a0da Binary files /dev/null and b/assets/images/author_images/sangam_swadik.jpg differ diff --git a/assets/images/author_images/stefanie-senger.jpeg b/assets/images/author_images/stefanie-senger.jpeg new file mode 100644 index 0000000..2ec227f Binary files /dev/null and b/assets/images/author_images/stefanie-senger.jpeg differ diff --git a/assets/images/author_images/yao-xiao.jpeg b/assets/images/author_images/yao-xiao.jpeg new file mode 100644 index 0000000..21578bd Binary files /dev/null and b/assets/images/author_images/yao-xiao.jpeg differ diff --git a/assets/images/posts_images/2023-paris-dev-sprint.png b/assets/images/posts_images/2023-paris-dev-sprint.png new file mode 100644 index 0000000..b276ce8 Binary files /dev/null and b/assets/images/posts_images/2023-paris-dev-sprint.png differ diff --git a/assets/images/posts_images/2023-paris-sprint/paris_2023.jpg b/assets/images/posts_images/2023-paris-sprint/paris_2023.jpg new file mode 100644 index 0000000..c5a5441 Binary files /dev/null and b/assets/images/posts_images/2023-paris-sprint/paris_2023.jpg differ diff --git a/assets/images/posts_images/2023-paris-sprint/thomas_olivier.jpg b/assets/images/posts_images/2023-paris-sprint/thomas_olivier.jpg new file mode 100644 index 0000000..dea2ef1 Binary files /dev/null and b/assets/images/posts_images/2023-paris-sprint/thomas_olivier.jpg differ diff --git a/assets/images/posts_images/BSD_wordmark.svg b/assets/images/posts_images/BSD_wordmark.svg new file mode 100644 index 0000000..ed48262 --- /dev/null +++ b/assets/images/posts_images/BSD_wordmark.svg @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff 
--git a/assets/images/posts_images/NVIDIAxsklearn.jpg b/assets/images/posts_images/NVIDIAxsklearn.jpg new file mode 100644 index 0000000..cbf1afe Binary files /dev/null and b/assets/images/posts_images/NVIDIAxsklearn.jpg differ diff --git a/assets/images/posts_images/adam-li-interview.png b/assets/images/posts_images/adam-li-interview.png new file mode 100644 index 0000000..6d6c483 Binary files /dev/null and b/assets/images/posts_images/adam-li-interview.png differ diff --git a/assets/images/posts_images/meekail-zain-interview.png b/assets/images/posts_images/meekail-zain-interview.png new file mode 100644 index 0000000..45c05cb Binary files /dev/null and b/assets/images/posts_images/meekail-zain-interview.png differ diff --git a/assets/images/posts_images/pandas_output_sklearn_transformers.PNG b/assets/images/posts_images/pandas_output_sklearn_transformers.PNG new file mode 100644 index 0000000..95b707b Binary files /dev/null and b/assets/images/posts_images/pandas_output_sklearn_transformers.PNG differ diff --git a/assets/images/posts_images/scipy-la-2022_logo.png b/assets/images/posts_images/scipy-la-2022_logo.png new file mode 100644 index 0000000..738c1eb Binary files /dev/null and b/assets/images/posts_images/scipy-la-2022_logo.png differ diff --git a/assets/images/posts_images/sklearn_czi.png b/assets/images/posts_images/sklearn_czi.png new file mode 100644 index 0000000..f319841 Binary files /dev/null and b/assets/images/posts_images/sklearn_czi.png differ diff --git a/assets/notebooks/sklearn-pandas-df-output.ipynb b/assets/notebooks/sklearn-pandas-df-output.ipynb new file mode 100644 index 0000000..efd5570 --- /dev/null +++ b/assets/notebooks/sklearn-pandas-df-output.ipynb @@ -0,0 +1,1931 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "10353174-f58e-4885-aba2-00c9b5b0ab24", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.simplefilter(\"ignore\", FutureWarning)" + ] + }, + { + "cell_type": 
"markdown", + "id": "e885c43c", + "metadata": {}, + "source": [ + "# scikit-learn\n", + "## Example: Pandas DataFrame output for sklearn transformer\n", + "\n", + "### Outline\n", + "- Example 1a: iris dataset (`StandardScalar` transformation)\n", + "- Example 1b: iris dataset (`PolynomialFeatures` transformation)\n", + "- Example 2: titanic dataset (with a Pipeline)\n", + "\n", + "### Prepared by\n", + "- Andreas Mueller\n", + "- Reshama Shaikh\n", + "\n", + "### Date\n", + "November 2022" + ] + }, + { + "cell_type": "markdown", + "id": "a711bdd1", + "metadata": {}, + "source": [ + "# Example 1a: iris dataset (`StandardScalar` transformation)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c327cbbd-44c6-4795-80fb-109c11707e77", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)
605.02.03.51.0
14.93.01.40.2
84.42.91.40.2
935.02.33.31.0
1064.92.54.51.7
...............
665.63.04.51.5
294.73.21.60.2
1307.42.86.11.9
1416.93.15.12.3
1116.42.75.31.9
\n", + "

112 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)\n", + "60 5.0 2.0 3.5 1.0\n", + "1 4.9 3.0 1.4 0.2\n", + "8 4.4 2.9 1.4 0.2\n", + "93 5.0 2.3 3.3 1.0\n", + "106 4.9 2.5 4.5 1.7\n", + ".. ... ... ... ...\n", + "66 5.6 3.0 4.5 1.5\n", + "29 4.7 3.2 1.6 0.2\n", + "130 7.4 2.8 6.1 1.9\n", + "141 6.9 3.1 5.1 2.3\n", + "111 6.4 2.7 5.3 1.9\n", + "\n", + "[112 rows x 4 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_iris\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X, y = load_iris(as_frame=True, return_X_y=True)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)\n", + "X_train" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c30feda5-8f07-4a9e-b2ba-2b19399e255b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.89426443, 0.7983005 , -1.27141116, -1.32760471],\n", + " [-1.24446588, -0.08694362, -1.32740725, -1.45907396],\n", + " [-0.66079679, 1.46223359, -1.27141116, -1.32760471],\n", + " [-0.89426443, 0.57698947, -1.15941899, -0.93319694],\n", + " [-0.42732916, -1.4148098 , -0.03949724, -0.27585067],\n", + " [-0.19386152, -0.52956568, 0.40847146, 0.1185571 ],\n", + " [-0.19386152, -0.52956568, 0.18448711, 0.1185571 ],\n", + " [-1.12773206, 0.13436741, -1.27141116, -1.45907396],\n", + " [ 0.15633993, 0.7983005 , 0.40847146, 0.51296486],\n", + " [ 1.55714575, -0.08694362, 1.1364206 , 0.51296486],\n", + " [ 0.50654139, 0.57698947, 1.24841277, 1.69618815],\n", + " [-0.31059534, -0.52956568, 0.63245581, 1.03884188],\n", + " [-0.0771277 , -0.75087671, 0.18448711, -0.27585067],\n", + " [ 0.50654139, -0.30825465, 1.02442842, 0.77590337],\n", + " [-0.42732916, -1.19349877, 0.12849102, 0.1185571 ],\n", + " [-0.89426443, 1.46223359, -1.27141116, -1.0646662 ],\n", + " [-1.47793352, 0.13436741, 
-1.27141116, -1.32760471],\n", + " [ 0.74000903, -0.08694362, 0.80044407, 1.03884188],\n", + " [-0.89426443, 1.68354462, -1.27141116, -1.19613545],\n", + " [ 0.62327521, 0.35567844, 0.40847146, 0.38149561],\n", + " [ 0.03960612, -0.08694362, 0.74444798, 0.77590337],\n", + " [-0.31059534, -0.08694362, 0.18448711, 0.1185571 ],\n", + " [ 0.97347666, 0.13436741, 0.52046363, 0.38149561],\n", + " [-0.89426443, 1.01961153, -1.32740725, -1.32760471],\n", + " [ 0.27307375, -0.97218774, 1.02442842, 0.25002635],\n", + " [-0.42732916, 1.01961153, -1.38340334, -1.32760471],\n", + " [-1.24446588, 0.7983005 , -1.04742681, -1.32760471],\n", + " [-0.42732916, -0.97218774, 0.35247537, -0.01291216],\n", + " [-0.0771277 , -0.75087671, 0.74444798, 0.90737262],\n", + " [ 0.97347666, 0.13436741, 0.35247537, 0.25002635],\n", + " [-0.89426443, 1.68354462, -1.21541508, -1.32760471],\n", + " [ 0.74000903, -0.52956568, 0.46446755, 0.38149561],\n", + " [ 1.2069443 , 0.13436741, 0.91243625, 1.17031113],\n", + " [-0.19386152, -1.19349877, 0.6884519 , 1.03884188],\n", + " [-1.47793352, 0.7983005 , -1.32740725, -1.19613545],\n", + " [ 1.32367812, 0.35567844, 0.52046363, 0.25002635],\n", + " [ 0.50654139, 0.7983005 , 1.02442842, 1.5647189 ],\n", + " [ 1.55714575, 1.24092256, 1.30440886, 1.69618815]])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.preprocessing import StandardScaler, PolynomialFeatures\n", + "\n", + "# set the transform method\n", + "scaler = StandardScaler()\n", + "\n", + "# transform the training data\n", + "scaler.fit(X_train)\n", + "\n", + "# transform the test data\n", + "X_test_scaled = scaler.transform(X_test)\n", + "\n", + "# Let's look at the output.\n", + "# Input data: X_train was a pandas dataframe\n", + "# Output data: X_test_scaled is a numpy array\n", + "\n", + "X_test_scaled" + ] + }, + { + "cell_type": "markdown", + "id": "01a80bd5", + "metadata": {}, + "source": [ + "# Example 1b: iris 
dataset (`PolynomialFeatures` transformation)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "72650a0b-3ad1-44d8-9a95-6441903cd71b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1. , 5. , 2. , ..., 12.25, 3.5 , 1. ],\n", + " [ 1. , 4.9 , 3. , ..., 1.96, 0.28, 0.04],\n", + " [ 1. , 4.4 , 2.9 , ..., 1.96, 0.28, 0.04],\n", + " ...,\n", + " [ 1. , 7.4 , 2.8 , ..., 37.21, 11.59, 3.61],\n", + " [ 1. , 6.9 , 3.1 , ..., 26.01, 11.73, 5.29],\n", + " [ 1. , 6.4 , 2.7 , ..., 28.09, 10.07, 3.61]])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We transform the training data (X_train) to polynomial features\n", + "# Output: type is ...\n", + "PolynomialFeatures().fit_transform(X_train)" + ] + }, + { + "cell_type": "markdown", + "id": "7c649ffc", + "metadata": {}, + "source": [ + "### Use `transform_output`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bd241d45-f95f-49d5-8b66-17adf66160d4", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import set_config\n", + "set_config(transform_output=\"pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fce8d4ee-fd9d-4ed0-a5d3-e6c6e81bb1e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
1sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)sepal length (cm)^2sepal length (cm) sepal width (cm)sepal length (cm) petal length (cm)sepal length (cm) petal width (cm)sepal width (cm)^2sepal width (cm) petal length (cm)sepal width (cm) petal width (cm)petal length (cm)^2petal length (cm) petal width (cm)petal width (cm)^2
601.05.02.03.51.025.0010.0017.505.004.007.002.0012.253.501.00
11.04.93.01.40.224.0114.706.860.989.004.200.601.960.280.04
81.04.42.91.40.219.3612.766.160.888.414.060.581.960.280.04
931.05.02.33.31.025.0011.5016.505.005.297.592.3010.893.301.00
1061.04.92.54.51.724.0112.2522.058.336.2511.254.2520.257.652.89
................................................
661.05.63.04.51.531.3616.8025.208.409.0013.504.5020.256.752.25
291.04.73.21.60.222.0915.047.520.9410.245.120.642.560.320.04
1301.07.42.86.11.954.7620.7245.1414.067.8417.085.3237.2111.593.61
1411.06.93.15.12.347.6121.3935.1915.879.6115.817.1326.0111.735.29
1111.06.42.75.31.940.9617.2833.9212.167.2914.315.1328.0910.073.61
\n", + "

112 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " 1 sepal length (cm) sepal width (cm) petal length (cm) \\\n", + "60 1.0 5.0 2.0 3.5 \n", + "1 1.0 4.9 3.0 1.4 \n", + "8 1.0 4.4 2.9 1.4 \n", + "93 1.0 5.0 2.3 3.3 \n", + "106 1.0 4.9 2.5 4.5 \n", + ".. ... ... ... ... \n", + "66 1.0 5.6 3.0 4.5 \n", + "29 1.0 4.7 3.2 1.6 \n", + "130 1.0 7.4 2.8 6.1 \n", + "141 1.0 6.9 3.1 5.1 \n", + "111 1.0 6.4 2.7 5.3 \n", + "\n", + " petal width (cm) sepal length (cm)^2 \\\n", + "60 1.0 25.00 \n", + "1 0.2 24.01 \n", + "8 0.2 19.36 \n", + "93 1.0 25.00 \n", + "106 1.7 24.01 \n", + ".. ... ... \n", + "66 1.5 31.36 \n", + "29 0.2 22.09 \n", + "130 1.9 54.76 \n", + "141 2.3 47.61 \n", + "111 1.9 40.96 \n", + "\n", + " sepal length (cm) sepal width (cm) sepal length (cm) petal length (cm) \\\n", + "60 10.00 17.50 \n", + "1 14.70 6.86 \n", + "8 12.76 6.16 \n", + "93 11.50 16.50 \n", + "106 12.25 22.05 \n", + ".. ... ... \n", + "66 16.80 25.20 \n", + "29 15.04 7.52 \n", + "130 20.72 45.14 \n", + "141 21.39 35.19 \n", + "111 17.28 33.92 \n", + "\n", + " sepal length (cm) petal width (cm) sepal width (cm)^2 \\\n", + "60 5.00 4.00 \n", + "1 0.98 9.00 \n", + "8 0.88 8.41 \n", + "93 5.00 5.29 \n", + "106 8.33 6.25 \n", + ".. ... ... \n", + "66 8.40 9.00 \n", + "29 0.94 10.24 \n", + "130 14.06 7.84 \n", + "141 15.87 9.61 \n", + "111 12.16 7.29 \n", + "\n", + " sepal width (cm) petal length (cm) sepal width (cm) petal width (cm) \\\n", + "60 7.00 2.00 \n", + "1 4.20 0.60 \n", + "8 4.06 0.58 \n", + "93 7.59 2.30 \n", + "106 11.25 4.25 \n", + ".. ... ... \n", + "66 13.50 4.50 \n", + "29 5.12 0.64 \n", + "130 17.08 5.32 \n", + "141 15.81 7.13 \n", + "111 14.31 5.13 \n", + "\n", + " petal length (cm)^2 petal length (cm) petal width (cm) \\\n", + "60 12.25 3.50 \n", + "1 1.96 0.28 \n", + "8 1.96 0.28 \n", + "93 10.89 3.30 \n", + "106 20.25 7.65 \n", + ".. ... ... 
\n", + "66 20.25 6.75 \n", + "29 2.56 0.32 \n", + "130 37.21 11.59 \n", + "141 26.01 11.73 \n", + "111 28.09 10.07 \n", + "\n", + " petal width (cm)^2 \n", + "60 1.00 \n", + "1 0.04 \n", + "8 0.04 \n", + "93 1.00 \n", + "106 2.89 \n", + ".. ... \n", + "66 2.25 \n", + "29 0.04 \n", + "130 3.61 \n", + "141 5.29 \n", + "111 3.61 \n", + "\n", + "[112 rows x 15 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Output: type is ...\n", + "PolynomialFeatures().fit_transform(X_train)" + ] + }, + { + "cell_type": "markdown", + "id": "644cc7e7", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "a035d647", + "metadata": {}, + "source": [ + "# Example 2: titanic dataset (with a Pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2704fbbb-eefd-4fdb-a7ae-428eef539d54", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's go back to the default settings\n", + "set_config(transform_output=\"default\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dff9ffc1-daa0-4de1-b494-a84a3b12816e", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_openml\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X, y = fetch_openml(\"titanic\", version=1, as_frame=True, return_X_y=True)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2ba9f7e6-d903-42d0-a56e-4c514c83a2c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pclassnamesexagesibspparchticketfarecabinembarkedboatbodyhome.dest
8683.0Holm, Mr. John Fredrik Alexandermale43.000.00.0C 70756.4500NoneSNoneNaNNone
9743.0Lobb, Mr. William Arthurmale30.001.00.0A/5. 333616.1000NoneSNoneNaNNone
6993.0Cacic, Mr. Lukamale38.000.00.03150898.6625NoneSNoneNaNCroatia
10443.0Murphy, Miss. NorafemaleNaN0.00.03656815.5000NoneQ16NaNNone
5452.0Renouf, Mrs. Peter Henry (Lillian Jefferys)female30.003.00.03102721.0000NoneSNoneNaNElizabeth, NJ
..........................................
11113.0Peacock, Master. Alfred Edwardmale0.751.01.0SOTON/O.Q. 310131513.7750NoneSNoneNaNNone
11233.0Peter, Mrs. Catherine (Catherine Rizk)femaleNaN0.02.0266822.3583NoneCDNaNNone
1121.0Fortune, Miss. Ethel Florafemale28.003.02.019950263.0000C23 C25 C27S10NaNWinnipeg, MB
13033.0Yousseff, Mr. GeriousmaleNaN0.00.0262714.4583NoneCNoneNaNNone
841.0Cumings, Mr. John Bradleymale39.001.00.0PC 1759971.2833C85CNoneNaNNew York, NY
\n", + "

981 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " pclass name sex age \\\n", + "868 3.0 Holm, Mr. John Fredrik Alexander male 43.00 \n", + "974 3.0 Lobb, Mr. William Arthur male 30.00 \n", + "699 3.0 Cacic, Mr. Luka male 38.00 \n", + "1044 3.0 Murphy, Miss. Nora female NaN \n", + "545 2.0 Renouf, Mrs. Peter Henry (Lillian Jefferys) female 30.00 \n", + "... ... ... ... ... \n", + "1111 3.0 Peacock, Master. Alfred Edward male 0.75 \n", + "1123 3.0 Peter, Mrs. Catherine (Catherine Rizk) female NaN \n", + "112 1.0 Fortune, Miss. Ethel Flora female 28.00 \n", + "1303 3.0 Yousseff, Mr. Gerious male NaN \n", + "84 1.0 Cumings, Mr. John Bradley male 39.00 \n", + "\n", + " sibsp parch ticket fare cabin embarked boat \\\n", + "868 0.0 0.0 C 7075 6.4500 None S None \n", + "974 1.0 0.0 A/5. 3336 16.1000 None S None \n", + "699 0.0 0.0 315089 8.6625 None S None \n", + "1044 0.0 0.0 36568 15.5000 None Q 16 \n", + "545 3.0 0.0 31027 21.0000 None S None \n", + "... ... ... ... ... ... ... ... \n", + "1111 1.0 1.0 SOTON/O.Q. 3101315 13.7750 None S None \n", + "1123 0.0 2.0 2668 22.3583 None C D \n", + "112 3.0 2.0 19950 263.0000 C23 C25 C27 S 10 \n", + "1303 0.0 0.0 2627 14.4583 None C None \n", + "84 1.0 0.0 PC 17599 71.2833 C85 C None \n", + "\n", + " body home.dest \n", + "868 NaN None \n", + "974 NaN None \n", + "699 NaN Croatia \n", + "1044 NaN None \n", + "545 NaN Elizabeth, NJ \n", + "... ... ... \n", + "1111 NaN None \n", + "1123 NaN None \n", + "112 NaN Winnipeg, MB \n", + "1303 NaN None \n", + "84 NaN New York, NY \n", + "\n", + "[981 rows x 13 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "736d3546-051b-4eca-bdee-3aa885c0302b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('columntransformer',\n",
+       "                 ColumnTransformer(transformers=[('pipeline',\n",
+       "                                                  Pipeline(steps=[('simpleimputer',\n",
+       "                                                                   SimpleImputer()),\n",
+       "                                                                  ('standardscaler',\n",
+       "                                                                   StandardScaler())]),\n",
+       "                                                  ['age', 'fare']),\n",
+       "                                                 ('onehotencoder',\n",
+       "                                                  OneHotEncoder(sparse=False),\n",
+       "                                                  ['embarked', 'sex',\n",
+       "                                                   'pclass'])],\n",
+       "                                   verbose_feature_names_out=False)),\n",
+       "                ('logisticregression', LogisticRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('columntransformer',\n", + " ColumnTransformer(transformers=[('pipeline',\n", + " Pipeline(steps=[('simpleimputer',\n", + " SimpleImputer()),\n", + " ('standardscaler',\n", + " StandardScaler())]),\n", + " ['age', 'fare']),\n", + " ('onehotencoder',\n", + " OneHotEncoder(sparse=False),\n", + " ['embarked', 'sex',\n", + " 'pclass'])],\n", + " verbose_feature_names_out=False)),\n", + " ('logisticregression', LogisticRegression())])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.compose import make_column_transformer\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "# Here we use `StandardScaler` for continuous variables; \n", + "# then we impute for missing data (check the documentation for the imputation method)\n", + "# We use `OneHotEncoder` for categorical variables\n", + "# NOTE: we are using a subset of the features (not all the columns)\n", + "\n", + "ct = make_column_transformer((make_pipeline(SimpleImputer(), \n", + " StandardScaler()), [\"age\", \"fare\"]),\n", + " (OneHotEncoder(sparse=False), [\"embarked\", \"sex\", \"pclass\"]), \n", + " verbose_feature_names_out=False)\n", + "\n", + "# Note: click on pipeline elements to see more details\n", + "clf = make_pipeline(ct, LogisticRegression())\n", + "clf" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6a88f293-9019-4915-b526-f7a1f871c518", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7865853658536586" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(X_train, y_train)\n", + "clf.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": 
"549b42bd-c84a-4348-9ecb-b261909dca9d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.44376706, -0.50468405, 0. , ..., 0. ,\n", + " 0. , 1. ],\n", + " [-0.44376706, 0.05957877, 1. , ..., 0. ,\n", + " 1. , 0. ],\n", + " [ 0.47033007, -0.34728169, 0. , ..., 0. ,\n", + " 0. , 1. ],\n", + " ...,\n", + " [ 0.39415531, 0.45300503, 1. , ..., 1. ,\n", + " 0. , 0. ],\n", + " [ 0.01328151, -0.14910643, 0. , ..., 1. ,\n", + " 0. , 0. ],\n", + " [-0.82464087, -0.39564025, 0. , ..., 0. ,\n", + " 1. , 0. ]])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Let's remove the last step in the pipeline (which is LogisticRegression()) & transform the X_test data\n", + "\n", + "clf[:-1].transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "13d10de4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7028ada4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pclassnamesexagesibspparchticketfarecabinembarkedboatbodyhome.dest
7203.0Colbert, Mr. Patrickmale24.00.00.03711097.2500NoneQNoneNaNCo Limerick, Ireland Sherbrooke, PQ
4942.0Mallet, Mrs. Albert (Antoinette Magnin)female24.01.01.0S.C./PARIS 207937.0042NoneC10NaNParis / Montreal, PQ
9683.0Lindell, Mr. Edvard Bengtssonmale36.01.00.034991015.5500NoneSANaNNone
\n", + "
" + ], + "text/plain": [ + " pclass name sex age sibsp \\\n", + "720 3.0 Colbert, Mr. Patrick male 24.0 0.0 \n", + "494 2.0 Mallet, Mrs. Albert (Antoinette Magnin) female 24.0 1.0 \n", + "968 3.0 Lindell, Mr. Edvard Bengtsson male 36.0 1.0 \n", + "\n", + " parch ticket fare cabin embarked boat body \\\n", + "720 0.0 371109 7.2500 None Q None NaN \n", + "494 1.0 S.C./PARIS 2079 37.0042 None C 10 NaN \n", + "968 0.0 349910 15.5500 None S A NaN \n", + "\n", + " home.dest \n", + "720 Co Limerick, Ireland Sherbrooke, PQ \n", + "494 Paris / Montreal, PQ \n", + "968 None " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# untransformed data (has all the orginal columns)\n", + "X_test.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "664872d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "S 222\n", + "C 68\n", + "Q 37\n", + "NaN 1\n", + "Name: embarked, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test['embarked'].value_counts(normalize=False, dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c7bdf8bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "male 211\n", + "female 117\n", + "Name: sex, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test['sex'].value_counts(normalize=False, dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d672e590", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.0 184\n", + "1.0 80\n", + "2.0 64\n", + "Name: pclass, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test['pclass'].value_counts(normalize=False, dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 
18, + "id": "6437e2f6", + "metadata": {}, + "outputs": [], + "source": [ + "# Notes\n", + "# a) Notice first two columns are numbers and from our transformer should be \"age\" and \"fare\"\n", + "# b) Next 4 cols are embarked (4 possible values); sex (2 possible values here); pclass (3 possible values)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "ed70d7ba-8b82-4cb8-a0ed-ae3825222134", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678910
0-0.443767-0.5046840.01.00.00.00.01.00.00.01.0
1-0.4437670.0595791.00.00.00.01.00.00.01.00.0
20.470330-0.3472820.00.01.00.00.01.00.00.01.0
32.1461752.2681100.00.01.00.01.00.01.00.00.0
41.5367771.3760851.00.00.00.00.01.01.00.00.0
....................................
323-1.6625630.0547580.00.01.00.00.01.00.01.00.0
3242.4508740.8749570.00.00.01.01.00.01.00.00.0
3250.3941550.4530051.00.00.00.01.00.01.00.00.0
3260.013282-0.1491060.00.01.00.00.01.01.00.00.0
327-0.824641-0.3956400.00.01.00.00.01.00.01.00.0
\n", + "

328 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 10\n", + "0 -0.443767 -0.504684 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0\n", + "1 -0.443767 0.059579 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0\n", + "2 0.470330 -0.347282 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0\n", + "3 2.146175 2.268110 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0\n", + "4 1.536777 1.376085 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0\n", + ".. ... ... ... ... ... ... ... ... ... ... ...\n", + "323 -1.662563 0.054758 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0\n", + "324 2.450874 0.874957 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0\n", + "325 0.394155 0.453005 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0\n", + "326 0.013282 -0.149106 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0\n", + "327 -0.824641 -0.395640 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0\n", + "\n", + "[328 rows x 11 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# transformed data\n", + "pd.DataFrame(clf[:-1].transform(X_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4d5d154a-8412-44a8-ac16-b7e7212bfd92", + "metadata": {}, + "outputs": [], + "source": [ + "set_config(transform_output=\"pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6cdd01e4-aef1-4ee1-97bf-f3e5b866c725", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agefareembarked_Cembarked_Qembarked_Sembarked_nansex_femalesex_malepclass_1.0pclass_2.0pclass_3.0
720-0.443767-0.5046840.01.00.00.00.01.00.00.01.0
494-0.4437670.0595791.00.00.00.01.00.00.01.00.0
9680.470330-0.3472820.00.01.00.00.01.00.00.01.0
1392.1461752.2681100.00.01.00.01.00.01.00.00.0
961.5367771.3760851.00.00.00.00.01.01.00.00.0
....................................
385-1.6625630.0547580.00.01.00.00.01.00.01.00.0
2842.4508740.8749570.00.00.01.01.00.01.00.00.0
2570.3941550.4530051.00.00.00.01.00.01.00.00.0
1940.013282-0.1491060.00.01.00.00.01.01.00.00.0
364-0.824641-0.3956400.00.01.00.00.01.00.01.00.0
\n", + "

328 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age fare embarked_C embarked_Q embarked_S embarked_nan \\\n", + "720 -0.443767 -0.504684 0.0 1.0 0.0 0.0 \n", + "494 -0.443767 0.059579 1.0 0.0 0.0 0.0 \n", + "968 0.470330 -0.347282 0.0 0.0 1.0 0.0 \n", + "139 2.146175 2.268110 0.0 0.0 1.0 0.0 \n", + "96 1.536777 1.376085 1.0 0.0 0.0 0.0 \n", + ".. ... ... ... ... ... ... \n", + "385 -1.662563 0.054758 0.0 0.0 1.0 0.0 \n", + "284 2.450874 0.874957 0.0 0.0 0.0 1.0 \n", + "257 0.394155 0.453005 1.0 0.0 0.0 0.0 \n", + "194 0.013282 -0.149106 0.0 0.0 1.0 0.0 \n", + "364 -0.824641 -0.395640 0.0 0.0 1.0 0.0 \n", + "\n", + " sex_female sex_male pclass_1.0 pclass_2.0 pclass_3.0 \n", + "720 0.0 1.0 0.0 0.0 1.0 \n", + "494 1.0 0.0 0.0 1.0 0.0 \n", + "968 0.0 1.0 0.0 0.0 1.0 \n", + "139 1.0 0.0 1.0 0.0 0.0 \n", + "96 0.0 1.0 1.0 0.0 0.0 \n", + ".. ... ... ... ... ... \n", + "385 0.0 1.0 0.0 1.0 0.0 \n", + "284 1.0 0.0 1.0 0.0 0.0 \n", + "257 1.0 0.0 1.0 0.0 0.0 \n", + "194 0.0 1.0 1.0 0.0 0.0 \n", + "364 0.0 1.0 0.0 1.0 0.0 \n", + "\n", + "[328 rows x 11 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(X_train, y_train)\n", + "\n", + "X_test_transformed = clf[:-1].transform(X_test)\n", + "X_test_transformed " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "6886048a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.3.3\n", + "1.21.2\n", + "1.2.dev0\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import sklearn\n", + "print(pd.__version__)\n", + "print(np.__version__)\n", + "print(sklearn.__version__)" + ] + }, + { + "cell_type": "markdown", + "id": "e8e8c2b2", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "cf92c4e3", + "metadata": {}, + "source": [ + "# Resources\n", + "\n", + "## Documentation\n", + "- Documentation of `set_output` API: 
https://scikit-learn.org/dev/auto_examples/miscellaneous/plot_set_output.html#sphx-glr-auto-examples-miscellaneous-plot-set-output-py\n", + "\n", + "## Try out the dev version\n", + "- scikit-learn 1.2.dev0\n", + "- Installing the nightly build:\n", + "http://scikit-learn.org/stable/developers/advanced_installation.html\n", + "\n", + "## Report any bugs or issues\n", + "#### We'd love to hear both about whether this helps your use cases and any bugs you find!\n", + "#### We are also specifically looking for feedback from library authors on how their experience is and if this change introduces any unexpected hassles.\n", + "Post any issues or bugs here: https://github.com/scikit-learn/scikit-learn/issues" + ] + }, + { + "cell_type": "markdown", + "id": "e0ee7985", + "metadata": {}, + "source": [ + "# FAQs\n", + "\n", + "### Q1: Why did this update take so long?\n", + "> There was no established dataframe when scikit-learn was initially released - and it was released in the context of scientific computing, in which dataframes make less sense. Adding it back in later was quite tricky because of the interactions of numpy and pandas, and the lack of annotated sparse formats.\n", + "\n", + "### Q2: Will pandas output be supported also in other estimators?\n", + "> This is still work in progress! Let us know what your use-cases are! What would you like to see?\n", + "\n", + "### Q3: Does this mean we could chain Column Transformers while referencing column names?\n", + "> Yes\n", + "\n", + "### Q4: When will version 1.2 be released?\n", + "> End of 2022!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e21eaaa5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/videos/NVIDIAxsklearn.mp4 b/assets/videos/NVIDIAxsklearn.mp4 new file mode 100644 index 0000000..7433bcc Binary files /dev/null and b/assets/videos/NVIDIAxsklearn.mp4 differ diff --git a/index.markdown b/index.markdown index dc308fc..cd549ea 100644 --- a/index.markdown +++ b/index.markdown @@ -11,6 +11,4 @@ sidebar: nav: "docs" --- - + diff --git a/welcome-bot/BannerCongratulations.jpg b/welcome-bot/BannerCongratulations.jpg new file mode 100644 index 0000000..ac4f2cb Binary files /dev/null and b/welcome-bot/BannerCongratulations.jpg differ diff --git a/welcome-bot/BannerThanks.jpg b/welcome-bot/BannerThanks.jpg new file mode 100644 index 0000000..6cd20e5 Binary files /dev/null and b/welcome-bot/BannerThanks.jpg differ diff --git a/welcome-bot/BannerWelcome.jpg b/welcome-bot/BannerWelcome.jpg new file mode 100644 index 0000000..725fce1 Binary files /dev/null and b/welcome-bot/BannerWelcome.jpg differ