Link mapping support #33


Closed · wants to merge 20 commits

23 changes: 3 additions & 20 deletions README.rst
@@ -1,22 +1,5 @@
=============================
sphinxcontrib-serializinghtml
=============================
This is a fork of https://github.com/sphinx-doc/sphinxcontrib-serializinghtml

sphinxcontrib-serializinghtml is a sphinx extension which outputs
"serialized" HTML files (json and pickle).
Changes made to this fork are to facilitate the creation of JSON files suitable for consumption by React.

For more details, please visit http://www.sphinx-doc.org/.

Installing
==========

Install from PyPI::

pip install -U sphinxcontrib-serializinghtml

Contributing
============

See `CONTRIBUTING.rst`__

.. __: https://github.com/sphinx-doc/sphinx/blob/master/CONTRIBUTING.rst
Since those changes are very specific, they have not been contributed back to the original repo.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
"Topic :: Text Processing",
"Topic :: Utilities",
]
dependencies = []
dependencies = ["beautifulsoup4"]
dynamic = ["version"]

[project.optional-dependencies]
69 changes: 62 additions & 7 deletions sphinxcontrib/serializinghtml/__init__.py
@@ -11,7 +11,7 @@
from sphinx.locale import get_translation
from sphinx.util.osutil import SEP, copyfile, ensuredir, os_path

from sphinxcontrib.serializinghtml import jsonimpl
from sphinxcontrib.serializinghtml import html_assists, jsonimpl

if TYPE_CHECKING:
from collections.abc import Sequence
@@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ...
def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ...
def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ...

__version__ = '2.0.0'
__version__ = '2.0.0+Linaro-241028'
__version_info__ = (2, 0, 0)

package_dir = path.abspath(path.dirname(__file__))
@@ -55,7 +55,24 @@ class SerializingHTMLBuilder(StandaloneHTMLBuilder):

def init(self) -> None:
self.build_info = BuildInfo(self.config, self.tags)
self.imagedir = '_images'
# Cope with the required configuration variables not being set in Sphinx.
# See the HTML builder comments for an explanation of image setup & handling.
html_image_dir = None
try:
html_image_dir = self.get_builder_config('image_dir', 'html')
except AttributeError:
pass
if html_image_dir is not None:
self.imagedir = html_image_dir
else:
self.imagedir = '_images'
html_image_path = None
try:
html_image_path = self.get_builder_config('image_path', 'html')
except AttributeError:
pass
self.imagepath = html_image_path
self.current_docname = ''
self.theme = None # type: ignore[assignment] # no theme necessary
self.templates = None # no template bridge necessary
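A minimal conf.py sketch of how these image options might be set. The option names html_image_dir and html_image_path are inferred from the get_builder_config('image_dir', 'html') / get_builder_config('image_path', 'html') lookups above and are not confirmed elsewhere in this diff; the values are placeholders.

# conf.py (sketch, hypothetical option names and values)
# Directory under the output root where images are copied; falls back to '_images'.
html_image_dir = "static/images"
# Path prefix the serialized pages should use when referring to images.
html_image_path = "/static/images"

If neither option is defined, the try/except above leaves the stock '_images' behaviour in place.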
@@ -64,13 +81,22 @@ def init(self) -> None:
self.init_css_files()
self.init_js_files()
self.use_index = self.get_builder_config('use_index', 'html')
#
# PJC: New configuration to allow mapping of external links to
# relative Hub links.
link_mappings = None
try:
link_mappings = self.get_builder_config('link_mappings', 'html')
except AttributeError:
pass
self.link_mappings = link_mappings

def get_target_uri(self, docname: str, typ: str | None = None) -> str:
if docname == 'index':
return ''
return ""
if docname.endswith(SEP + 'index'):
return docname[:-5] # up to sep
return docname + SEP
return docname

def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str]) -> None:
context = context.copy()
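A sketch of how the link-mapping configuration might look in conf.py. The option name (likely html_link_mappings, given the get_builder_config('link_mappings', 'html') lookup) and the way it is registered are not shown in this diff; the URLs and paths below are illustrative only. The dictionary shape matches what html_assists.rewrite_hub_links (added later in this PR) expects: keys are external URL prefixes, values are the relative Hub paths they are rewritten to, with any ".html" stripped from matched links.

# conf.py (sketch, hypothetical option name and values)
html_link_mappings = {
    "https://example.org/docs/": "learn/docs/",
    "https://example.org/blog/": "blog/",
}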
@@ -91,9 +117,24 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p
ctx.setdefault('pathto', lambda p: p)
self.add_sidebars(pagename, ctx)

# Add the toc tree as a JSON dictionary
ctx['toctree'] = html_assists.convert_nav_html_to_json(self._get_local_toctree(pagename))

if not outfilename:
# PJC: Ensure that index files are actually written under the name of the
# directory leafname.
parts = pagename.split(SEP)
if parts[len(parts)-1] == "index":
if len(parts) == 1:
# Use the project name
page_filename = self.get_builder_config('project_name', 'html')
else:
page_filename = SEP.join(parts[:-1])
ctx['current_page_name'] = page_filename
else:
page_filename = pagename
outfilename = path.join(self.outdir,
os_path(pagename) + self.out_suffix)
os_path(page_filename) + self.out_suffix)

# we're not taking the return value here, since no template is
# actually rendered
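As an illustration of the index-renaming rule above, a standalone sketch (not part of the PR) that mirrors the logic; "my-project" stands in for the value the builder would read via get_builder_config('project_name', 'html'), an option name inferred from that lookup.

SEP = "/"

def output_name(pagename: str, project_name: str = "my-project", suffix: str = ".json") -> str:
    # Mirror of the handle_page logic: a top-level 'index' takes the project
    # name, 'section/index' collapses to 'section', everything else keeps its name.
    parts = pagename.split(SEP)
    if parts[-1] == "index":
        page = project_name if len(parts) == 1 else SEP.join(parts[:-1])
    else:
        page = pagename
    return page + suffix

assert output_name("index") == "my-project.json"
assert output_name("guides/index") == "guides.json"
assert output_name("guides/setup") == "guides/setup.json"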
@@ -104,6 +145,20 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p
if isinstance(ctx[key], types.FunctionType):
del ctx[key]

if "body" in ctx:
# PJC: Some Linaro documentation has encoded attributes in image ALT text
# which then gets decoded when the HTML is loaded into the DOM, so
# we need to alter it by "escaping" the ampersands with &amp; to
# prevent the decoding.
ctx['body'] = html_assists.escape_encoded_alt_text(ctx['body'])
# PJC: Furthermore, if there is any formatted code with encoded attributes,
# e.g. < changed to &lt; then that also needs to be escaped because it is
# also getting decoded.
ctx['body'] = html_assists.escape_encoded_pre_text(ctx['body'])
# PJC: Go through the body, looking for any <a> tags to see if they
# need to be re-mapped to a local Hub path.
ctx['body'] = html_assists.rewrite_hub_links(ctx['body'], self.link_mappings)

ensuredir(path.dirname(outfilename))
self.dump_context(ctx, outfilename)
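The double-escaping applied to the body can be illustrated with html.escape alone; this is a standalone sketch independent of the builder, with "&lt;board&gt;" as an invented example of encoded ALT text.

from html import escape

decoded = "<board>"        # what BeautifulSoup hands back after parsing alt="&lt;board&gt;"
once = escape(decoded)     # '&lt;board&gt;'        - the encoded form the page originally contained
twice = escape(once)       # '&amp;lt;board&amp;gt;' - survives one round of DOM decoding
print(once)
print(twice)

When the front end later injects the body and the browser decodes entities once, the doubly escaped form collapses back to "&lt;board&gt;", so the reader sees the literal text rather than having it interpreted as markup.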

@@ -161,7 +216,7 @@ class JSONHTMLBuilder(SerializingHTMLBuilder):
implementation_dumps_unicode = True
indexer_format = jsonimpl
indexer_dumps_unicode = True
out_suffix = '.fjson'
out_suffix = '.json'
globalcontext_filename = 'globalcontext.json'
searchindex_filename = 'searchindex.json'
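A small sketch of consuming the builder's output. The build directory and page name are illustrative; the keys shown (body, toctree, current_page_name) are the ones populated in handle_page above, and the .json suffix reflects the out_suffix change in this hunk.

import json

# Assumes a build such as: sphinx-build -b json docs _build/json
with open("_build/json/guides/setup.json", encoding="utf-8") as f:
    page = json.load(f)

print(page["current_page_name"])   # page identifier used by the front end
print(page["toctree"])             # navigation as the JSON structure built by html_assists
body_html = page["body"]           # escaped HTML body, ready for the React front end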

127 changes: 127 additions & 0 deletions sphinxcontrib/serializinghtml/html_assists.py
@@ -0,0 +1,127 @@
from bs4 import BeautifulSoup, element
import sys
from html import escape

def clean_href(href: str) -> str:
""" Make sure the href doesn't start or end with a / """
if href[0] == "/":
href = href[1:]
if href[-1] == "/":
href = href[:-1]
return href

def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict:
section_result = []
for child in list_entry.children:
if type(child) is element.Tag and child.name == "li":
section_result.append(convert_tag_to_link(child))
return {
"type": "expandable-link-group",
"text": parent_entry.contents[0].contents[0],
"href": clean_href(parent_entry.contents[0]["href"]),
"items": section_result
}

def convert_tag_to_link(item_entry: element.Tag) -> dict:
# The a tag is a child of the li tag
a_tag = item_entry.contents[0]
return {
"type": "link",
"text": a_tag.contents[0],
"href": clean_href(a_tag["href"])
}

def process_section(result, child, section, pending_divider) -> bool:
if section != []:
# Yes, there is, so we have a sub-section. If we've got some content
# already, add a divider.
if result != []:
result.append({ "type": "divider" })
# Now append the current page and the section links. The
# ul tag is the only child returned, hence [0]
result.append(section_links(child, section[0]))
# If there are any "normal" entries after this section
# add a divider first
pending_divider = True
else:
if pending_divider:
result.append({ "type": "divider" })
pending_divider = False
result.append(convert_tag_to_link(child))
return pending_divider

def process_ul_children(result, ul):
pending_divider = False
for child in ul.children:
if type(child) is element.Tag and child.name == "li":
# Is there a new unordered list within this section?
section = child.find_all("ul", limit=1)
pending_divider = process_section(result, child, section, pending_divider)

def convert_nav_html_to_json(html: str) -> list:
result = []
soup = BeautifulSoup(html, "html.parser")

# Start with the unordered list
ul = soup.ul
# Iterate through list items
while ul is not None:
process_ul_children(result, ul)
while True:
ul = ul.next_sibling
if ul is None or type(ul) is element.Tag:
break
# Not an acceptable type - loop and get the next sibling
return result

def escape_encoded_alt_text(html: str) -> str:
edited = False
soup = BeautifulSoup(html, "html.parser")
images = soup.find_all('img')
for img in images:
if img['alt'] != "":
# At this point, Beautiful Soup has done what a browser does - decode
# any encoded attributes. So we need to re-encode the string, see if
# there are any ampersands and, if so, re-encode them again.
interim = escape(img['alt'])
if interim.find("&") != -1:
img['alt'] = escape(interim)
edited = True

if edited:
html = str(soup)
return html

def escape_encoded_pre_text(html: str) -> str:
edited = False
soup = BeautifulSoup(html, "html.parser")
spans = soup.find_all('span', class_="pre")
for span in spans:
# At this point, Beautiful Soup has done what a browser does - decode
# any encoded attributes. So we need to re-encode the string, see if
# there are any ampersands and, if so, re-encode them again.
interim = escape(span.string)
if interim.find("&") != -1:
span.string = escape(interim)
edited = True

if edited:
html = str(soup)
return html

def rewrite_hub_links(html: str, link_mappings: dict) -> str:
edited = False
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all('a')
for link in links:
for key in link_mappings:
if link['href'].startswith(key):
# We have a match, so replace the href with the new one
link['href'] = link['href'].replace(key, link_mappings[key])
# We also have to remove ".html" from the end of the link
link['href'] = link['href'].replace(".html", "")
edited = True

if edited:
html = str(soup)
return html
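A quick illustration of the navigation conversion, assuming the module is importable as added in this PR. The input string mimics the shape of Sphinx's local toctree HTML, written with each <a> as the first child of its <li> (no intervening whitespace), since the converter indexes contents[0]; the page names are invented.

from sphinxcontrib.serializinghtml import html_assists

nav = (
    '<ul>'
    '<li><a href="/overview/">Overview</a></li>'
    '<li><a href="/guides/">Guides</a>'
    '<ul><li><a href="/guides/setup/">Setup</a></li></ul>'
    '</li>'
    '</ul>'
)
print(html_assists.convert_nav_html_to_json(nav))
# Roughly:
# [{'type': 'link', 'text': 'Overview', 'href': 'overview'},
#  {'type': 'divider'},
#  {'type': 'expandable-link-group', 'text': 'Guides', 'href': 'guides',
#   'items': [{'type': 'link', 'text': 'Setup', 'href': 'guides/setup'}]}]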