Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Fix do all redirects #49

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Feb 12, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 242 additions & 0 deletions _websiteutils/make_redirects_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
#!/usr/bin/env python3

import argparse
import functools
import logging
import multiprocessing
import os
import pathlib
import re
import tempfile
import shutil

"""
This script does three things that improve the website organization.

First, we used to host in the root of the webpage, but have now moved to
``/stable/``. We do not want obsolete links to link to nothing (or that has
been our policy), so we currently just keep the old version at the top level.
Here, instead, we either softlink to the newest version, or replace the file by
an html refresh redirect.

Second, it changes the canonical link in each html file to the newest version
found of the html file (including stable if it is in the latest version).

Third, the script adds a new div to the top of all the old webpages with
tag ``olddocs-message`` to warn users that the page is obsolete.

This script takes a while, and is destructive, so should probably be run on a
branch and pushed as a PR so it can easily be reverted.
"""

# Logger shared by every function in this script.
_log = logging.getLogger("make_redirect_links")


# Candidate documentation directories, newest first: ``stable`` followed by
# every ``major.minor.micro`` from 6.6.6 down to 0.0.0.  The ``__main__``
# section trims this to the directories that actually exist.
tocheck = [pathlib.Path("stable")] + [
    pathlib.Path(f"{major}.{minor}.{micro}")
    for major in range(6, -1, -1)
    for minor in range(6, -1, -1)
    for micro in range(6, -1, -1)
]

# Top-level entries that must never be redirected or soft-linked: the
# versioned docs themselves plus the names listed here.
toignore = tocheck + [pathlib.Path(p) for p in [
    "mpl-probscale",
    "mpl_examples",
    "mpl_toolkits",
    "_webpageutils",
    # This script lives in ``_websiteutils``; the entry above looks like a
    # misspelling of that name and is kept only for backward compatibility.
    "_websiteutils",
    "xkcd",
    "_sitemap",
    "robots.txt",
    "CNAME",
    ".git",
]]

logging.basicConfig(level=logging.DEBUG)


@functools.cache
def findlast(fname, tocheck):
    """
    Return the first directory in ``tocheck`` that contains ``fname``.

    The directories are tried in the order given and ``None`` is returned
    when ``fname`` exists in none of them.  Results are memoized, so both
    arguments must be hashable (e.g. pass ``tocheck`` as a tuple).
    """
    hits = (folder for folder in tocheck if (folder / fname).exists())
    return next(hits, None)


# Template for the stub page that replaces a moved html file.  ``{newurl}`` is
# the target of the refresh redirect and of the fallback link; ``{canonical}``
# is appended to https://matplotlib.org/ to form the canonical link.
html_redirect = """<!DOCTYPE HTML>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="refresh" content="0;url={newurl}" />
<link rel="canonical" href="https://matplotlib.org/{canonical}" />
</head>
<body>
<h1>
The page has been moved <a href="{newurl}">here</a>!
</h1>
</body>
</html>
"""

# Each banner is deliberately a single line of html so that it can be found
# and replaced with a simple search (a multi-line banner would require
# tracking the closing tags).
_warn_banner_head = (
    '<div id="unreleased-message"> You are reading an old version of the '
    'documentation (v{version}). For the latest version see ')

# Used when the page still exists in the latest docs: link straight to it.
warn_banner_exists = _warn_banner_head + '<a href="{url}">{url}</a></div>\n'

# Used when the page is gone from the latest docs: link to the stable root.
warn_banner_old = (
    _warn_banner_head
    + '<a href="/stable/">https://matplotlib.org/stable/</a> </div>\n')


def do_links(root0):
    """
    Replace files below *root0* by pointers to their newest versioned copy.

    Every file that ``findlast`` locates in one of the ``tocheck``
    directories is deleted.  An ``.htm``/``.html`` file is rewritten as an
    html refresh redirect; any other file becomes a relative soft link.
    Files that are not found in any ``tocheck`` directory are left alone.
    """

    _log.info(f"Doing links on {root0}")
    for root, _dirs, files in os.walk(root0):
        for name in files:
            here = pathlib.Path(root, name)
            newest_dir = findlast(here, tocheck)
            _log.debug(f"Checking: {here} found {newest_dir}")
            if newest_dir is None:
                # no versioned copy exists; keep the file as it is
                continue

            here.unlink()
            target = newest_dir / here
            # The link has to be relative to the directory holding the file.
            # `Path.relative_to` does not allow '.' as a common path prefix,
            # hence `os.path.relpath`.
            rel_target = os.path.relpath(target, start=here.parent)

            if not name.endswith((".htm", ".html")):
                # soft link
                _log.info(f"Linking {here} to {target}")
                here.symlink_to(rel_target)
                continue

            # make an html redirect.
            _log.info(f"Rewriting HTML: {here} in {newest_dir}")
            page = html_redirect.format(newurl=rel_target, canonical=target)
            here.write_text(page)


def do_canonicals(dname):
    """
    Update the canonical link of every html file found below *dname*.

    Each page is looked up in the ``tocheck`` directories and, when found,
    handed to ``update_canonical`` together with the directory it was found
    in and a flag saying whether *dname* is ``tocheck[1]``.
    """
    _log.debug(f"Walking {dname}")
    # NOTE: `Path.rglob` doesn't seem to support multiple patterns, so only
    # ``*.html`` is searched; per review, there are no ``.htm`` files.
    for page in dname.rglob("*.html"):
        _log.debug(f"Checking {page}")
        # Drop the first path component (the version directory when *dname*
        # is an entry of ``tocheck``) to get the version-independent path.
        relative = pathlib.Path(*page.parts[1:])
        newest_dir = findlast(relative, tocheck)
        if newest_dir is None:
            continue
        is_newest = dname == tocheck[1]
        update_canonical(page, newest_dir, is_newest)


def update_canonical(fullname, last, newest):
    """
    Change the canonical link in *fullname* to the same link in the
    version given by *last*.  We do this with a regexp to prevent
    removing any other content on a line that has the canonical link.

    Also add a banner (div) in the body if an old version of the docs.

    Only the first line that contains a canonical link is rewritten; every
    canonical link on that line is changed.

    Parameters
    ----------
    fullname : pathlib.Path
        The html file to rewrite in place.  Its first path component is used
        as the version shown in the banner.
    last : pathlib.Path
        Directory that the new canonical link should point into.
    newest : bool
        If true, no banner is added to the page.
    """
    pre = "https://matplotlib.org/"
    pnew = last.joinpath(*fullname.parts[1:])
    newcanon = f"{pre}{str(pnew)}"
    _log.info(f"{fullname} to {pre}{str(pnew)}")
    # ``[^"]*`` stops at the closing quote of the href.  A greedy ``.*`` would
    # run on to the last quote of the line and eat whatever follows the link.
    rec = re.compile(rb'<link rel="canonical" href="[^"]*"')
    new_link = f'<link rel="canonical" href="{newcanon}"'.encode("utf-8")
    # Openings of banners written by a previous run.  The templates in this
    # file use ``unreleased-message``; ``olddocs-message`` is what this
    # function has always looked for.  Accepting both keeps re-runs from
    # stacking banners.
    banner_ids = (b'<div id="olddocs-message">',
                  b'<div id="unreleased-message">')

    fout = tempfile.NamedTemporaryFile(delete=False)
    moved = False
    try:
        with fout, fullname.open("rb") as fin:
            found = False
            for line in fin:
                if not found and b'<link rel="canonical"' in line:
                    # A callable replacement is inserted verbatim, so
                    # backslashes in the url are never read as escapes.
                    ll = rec.sub(lambda _match: new_link, line)
                    _log.debug(f"new {line}->{ll}")
                    fout.write(ll)
                    found = True
                elif b'<body>' in line and not newest:
                    # add a warning right under:
                    fout.write(line)
                    if last == tocheck[0]:
                        new = warn_banner_exists.format(
                            version=fullname.parts[0],
                            url=newcanon)
                    else:
                        new = warn_banner_old.format(
                            version=fullname.parts[0])
                    fout.write(new.encode("utf-8"))
                    # ``<body>`` may be the last line of the file, so do not
                    # let ``next`` raise StopIteration.
                    following = next(fin, None)
                    if following is not None and not any(
                            marker in following for marker in banner_ids):
                        # write the line out if it wasn't an old banner:
                        fout.write(following)
                else:
                    fout.write(line)

        shutil.move(fout.name, fullname)
        moved = True
    finally:
        # Do not leave a stray temporary file behind when anything failed.
        if not moved and os.path.exists(fout.name):
            os.unlink(fout.name)


def _use_tocheck(paths):
    """
    Install *paths* as the module-level ``tocheck`` of a worker process.

    Used as the ``multiprocessing.Pool`` initializer.  A worker that is
    started by importing this module afresh does not run the ``__main__``
    section, so it would otherwise see the untrimmed *list* ``tocheck``.
    """
    global tocheck
    tocheck = paths


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument("--np", type=int, help="Number of processors to use")
    parser.add_argument("--no-canonicals", help="do not do canonical links",
                        action="store_true")
    parser.add_argument("--no-redirects", help="do not do redirects links",
                        action="store_true")

    args = parser.parse_args()
    if args.np:
        np = args.np
    else:
        np = None

    # figure out the newest version and trim tocheck at the same time:
    # (a tuple, because ``findlast`` is cached and needs hashable arguments)
    tocheck = tuple(p for p in tocheck if p.exists())
    print(tocheck)

    # html redirect or soft link most things in the top-level directory that
    # are not other modules or versioned docs.
    if not args.no_redirects:
        for entry in os.scandir("."):
            fullname = pathlib.Path(entry.name)
            if fullname not in toignore:
                if entry.is_dir():
                    do_links(entry.name)
                elif fullname.suffix == ".html":
                    last = findlast(fullname, tocheck)
                    _log.debug(f"Checking: {fullname} found {last}")
                    if last is not None:
                        fullname.unlink()
                        _log.info(f"Rewriting HTML: {fullname} in {last}")
                        with fullname.open("w") as fout:
                            oldname = last / fullname
                            st = html_redirect.format(newurl=oldname,
                                                      canonical=oldname)
                            fout.write(st)
        _log.info("Done links and redirects")

    # change the canonical url for all html to the newest version in the docs:
    if not args.no_canonicals:
        if np is not None:
            # Review reported that this option failed in the workers with
            # ``TypeError: unhashable type: 'list'`` raised from
            # ``findlast(basename, tocheck)``, while it worked for another
            # reviewer.  Workers that re-import this module get the original
            # list ``tocheck`` rather than the trimmed tuple built above, so
            # hand the tuple to every worker explicitly.
            with multiprocessing.Pool(
                    np, initializer=_use_tocheck,
                    initargs=(tocheck,)) as pool:
                pool.map(do_canonicals, tocheck[1:])
        else:
            for t in tocheck[1:]:
                do_canonicals(t)