Link mapping support #33


Closed · wants to merge 20 commits

23 changes: 3 additions & 20 deletions README.rst
@@ -1,22 +1,5 @@
=============================
sphinxcontrib-serializinghtml
=============================
This is a fork of https://github.com/sphinx-doc/sphinxcontrib-serializinghtml

sphinxcontrib-serializinghtml is a sphinx extension which outputs
"serialized" HTML files (json and pickle).
Changes made to this fork are to facilitate the creation of JSON files suitable for consumption by React.

For more details, please visit http://www.sphinx-doc.org/.

Installing
==========

Install from PyPI::

pip install -U sphinxcontrib-serializinghtml

Contributing
============

See `CONTRIBUTING.rst`__

.. __: https://github.com/sphinx-doc/sphinx/blob/master/CONTRIBUTING.rst
Since those changes are very specific, they have not been contributed back to the original repo.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
"Topic :: Text Processing",
"Topic :: Utilities",
]
dependencies = []
dependencies = ["beautifulsoup4"]
dynamic = ["version"]

[project.optional-dependencies]
69 changes: 62 additions & 7 deletions sphinxcontrib/serializinghtml/__init__.py
@@ -11,7 +11,7 @@
from sphinx.locale import get_translation
from sphinx.util.osutil import SEP, copyfile, ensuredir, os_path

from sphinxcontrib.serializinghtml import jsonimpl
from sphinxcontrib.serializinghtml import html_assists, jsonimpl

if TYPE_CHECKING:
from collections.abc import Sequence
@@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ...
def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ...
def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ...

__version__ = '2.0.0'
__version__ = '2.0.0+Linaro-241028'
__version_info__ = (2, 0, 0)

package_dir = path.abspath(path.dirname(__file__))
@@ -55,7 +55,24 @@ class SerializingHTMLBuilder(StandaloneHTMLBuilder):

def init(self) -> None:
self.build_info = BuildInfo(self.config, self.tags)
self.imagedir = '_images'
# Cope with the required configuration variables not being set in Sphinx.
# See the HTML builder comments for an explanation of image setup & handling.
html_image_dir = None
try:
html_image_dir = self.get_builder_config('image_dir', 'html')
except AttributeError:
pass
if html_image_dir is not None:
self.imagedir = html_image_dir
else:
self.imagedir = '_images'
html_image_path = None
try:
html_image_path = self.get_builder_config('image_path', 'html')
except AttributeError:
pass
self.imagepath = html_image_path
self.current_docname = ''
self.theme = None # type: ignore[assignment] # no theme necessary
self.templates = None # no template bridge necessary
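A minimal conf.py sketch of how these image options might be set. The option names html_image_dir and html_image_path are inferred from the get_builder_config('image_dir', 'html') / get_builder_config('image_path', 'html') lookups above and are not confirmed elsewhere in this diff; the values are placeholders.

# conf.py (sketch, hypothetical option names and values)
# Directory under the output root where images are copied; falls back to '_images'.
html_image_dir = "static/images"
# Path prefix the serialized pages should use when referring to images.
html_image_path = "/static/images"

If neither option is defined, the try/except above leaves the stock '_images' behaviour in place.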
@@ -64,13 +81,22 @@ def init(self) -> None:
self.init_css_files()
self.init_js_files()
self.use_index = self.get_builder_config('use_index', 'html')
#
# PJC: New configuration to allow mapping of external links to
# relative Hub links.
link_mappings = None
try:
link_mappings = self.get_builder_config('link_mappings', 'html')
except AttributeError:
pass
self.link_mappings = link_mappings

def get_target_uri(self, docname: str, typ: str | None = None) -> str:
if docname == 'index':
return ''
return ""
if docname.endswith(SEP + 'index'):
return docname[:-5] # up to sep
return docname + SEP
return docname

def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str]) -> None:
context = context.copy()
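A sketch of how the link-mapping configuration might look in conf.py. The option name (likely html_link_mappings, given the get_builder_config('link_mappings', 'html') lookup) and the way it is registered are not shown in this diff; the URLs and paths below are illustrative only. The dictionary shape matches what html_assists.rewrite_hub_links (added later in this PR) expects: keys are external URL prefixes, values are the relative Hub paths they are rewritten to, with any ".html" stripped from matched links.

# conf.py (sketch, hypothetical option name and values)
html_link_mappings = {
    "https://example.org/docs/": "learn/docs/",
    "https://example.org/blog/": "blog/",
}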
@@ -91,9 +117,24 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p
ctx.setdefault('pathto', lambda p: p)
self.add_sidebars(pagename, ctx)

# Add the toc tree as a JSON dictionary
ctx['toctree'] = html_assists.convert_nav_html_to_json(self._get_local_toctree(pagename))

if not outfilename:
# PJC: Ensure that index files are actually written under the name of the
# directory leafname.
parts = pagename.split(SEP)
if parts[len(parts)-1] == "index":
if len(parts) == 1:
# Use the project name
page_filename = self.get_builder_config('project_name', 'html')
else:
page_filename = SEP.join(parts[:-1])
ctx['current_page_name'] = page_filename
else:
page_filename = pagename
outfilename = path.join(self.outdir,
os_path(pagename) + self.out_suffix)
os_path(page_filename) + self.out_suffix)

# we're not taking the return value here, since no template is
# actually rendered
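As an illustration of the index-renaming rule above, a standalone sketch (not part of the PR) that mirrors the logic; "my-project" stands in for the value the builder would read via get_builder_config('project_name', 'html'), an option name inferred from that lookup.

SEP = "/"

def output_name(pagename: str, project_name: str = "my-project", suffix: str = ".json") -> str:
    # Mirror of the handle_page logic: a top-level 'index' takes the project
    # name, 'section/index' collapses to 'section', everything else keeps its name.
    parts = pagename.split(SEP)
    if parts[-1] == "index":
        page = project_name if len(parts) == 1 else SEP.join(parts[:-1])
    else:
        page = pagename
    return page + suffix

assert output_name("index") == "my-project.json"
assert output_name("guides/index") == "guides.json"
assert output_name("guides/setup") == "guides/setup.json"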
@@ -104,6 +145,20 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p
if isinstance(ctx[key], types.FunctionType):
del ctx[key]

if "body" in ctx:
# PJC: Some Linaro documentation has encoded attributes in image ALT text
# which then gets decoded when the HTML is loaded into the DOM, so
# we need to alter it by "escaping" the ampersands with &amp; to
# prevent the decoding.
ctx['body'] = html_assists.escape_encoded_alt_text(ctx['body'])
# PJC: Furthermore, if there is any formatted code with encoded attributes,
# e.g. < changed to &lt; then that also needs to be escaped because it is
# also getting decoded.
ctx['body'] = html_assists.escape_encoded_pre_text(ctx['body'])
# PJC: Go through the body, looking for any <a> tags to see if they
# need to be re-mapped to a local Hub path.
ctx['body'] = html_assists.rewrite_hub_links(ctx['body'], self.link_mappings)

ensuredir(path.dirname(outfilename))
self.dump_context(ctx, outfilename)
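The double-escaping applied to the body can be illustrated with html.escape alone; this is a standalone sketch independent of the builder, with "&lt;board&gt;" as an invented example of encoded ALT text.

from html import escape

decoded = "<board>"        # what BeautifulSoup hands back after parsing alt="&lt;board&gt;"
once = escape(decoded)     # '&lt;board&gt;'        - the encoded form the page originally contained
twice = escape(once)       # '&amp;lt;board&amp;gt;' - survives one round of DOM decoding
print(once)
print(twice)

When the front end later injects the body and the browser decodes entities once, the doubly escaped form collapses back to "&lt;board&gt;", so the reader sees the literal text rather than having it interpreted as markup.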

@@ -161,7 +216,7 @@ class JSONHTMLBuilder(SerializingHTMLBuilder):
implementation_dumps_unicode = True
indexer_format = jsonimpl
indexer_dumps_unicode = True
out_suffix = '.fjson'
out_suffix = '.json'
globalcontext_filename = 'globalcontext.json'
searchindex_filename = 'searchindex.json'
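A small sketch of consuming the builder's output. The build directory and page name are illustrative; the keys shown (body, toctree, current_page_name) are the ones populated in handle_page above, and the .json suffix reflects the out_suffix change in this hunk.

import json

# Assumes a build such as: sphinx-build -b json docs _build/json
with open("_build/json/guides/setup.json", encoding="utf-8") as f:
    page = json.load(f)

print(page["current_page_name"])   # page identifier used by the front end
print(page["toctree"])             # navigation as the JSON structure built by html_assists
body_html = page["body"]           # escaped HTML body, ready for the React front end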

127 changes: 127 additions & 0 deletions sphinxcontrib/serializinghtml/html_assists.py
@@ -0,0 +1,127 @@
from bs4 import BeautifulSoup, element
import sys
from html import escape

def clean_href(href: str) -> str:
""" Make sure the href doesn't start or end with a / """
if href[0] == "/":
href = href[1:]
if href[-1] == "/":
href = href[:-1]
return href

def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict:
section_result = []
for child in list_entry.children:
if type(child) is element.Tag and child.name == "li":
section_result.append(convert_tag_to_link(child))
return {
"type": "expandable-link-group",
"text": parent_entry.contents[0].contents[0],
"href": clean_href(parent_entry.contents[0]["href"]),
"items": section_result
}

def convert_tag_to_link(item_entry: element.Tag) -> dict:
# The a tag is a child of the li tag
a_tag = item_entry.contents[0]
return {
"type": "link",
"text": a_tag.contents[0],
"href": clean_href(a_tag["href"])
}

def process_section(result, child, section, pending_divider) -> bool:
if section != []:
# Yes, there is, so we have a sub-section. If we've got some content
# already, add a divider.
if result != []:
result.append({ "type": "divider" })
# Now append the current page and the section links. The
# ul tag is the only child returned, hence [0]
result.append(section_links(child, section[0]))
# If there are any "normal" entries after this section
# add a divider first
pending_divider = True
else:
if pending_divider:
result.append({ "type": "divider" })
pending_divider = False
result.append(convert_tag_to_link(child))
return pending_divider

def process_ul_children(result, ul):
pending_divider = False
for child in ul.children:
if type(child) is element.Tag and child.name == "li":
# Is there a new unordered list within this section?
section = child.find_all("ul", limit=1)
pending_divider = process_section(result, child, section, pending_divider)

def convert_nav_html_to_json(html: str) -> list:
result = []
soup = BeautifulSoup(html, "html.parser")

# Start with the unordered list
ul = soup.ul
# Iterate through list items
while ul is not None:
process_ul_children(result, ul)
while True:
ul = ul.next_sibling
if ul is None or type(ul) is element.Tag:
break
# Not an acceptable type - loop and get the next sibling
return result

def escape_encoded_alt_text(html: str) -> str:
edited = False
soup = BeautifulSoup(html, "html.parser")
images = soup.find_all('img')
for img in images:
if img['alt'] != "":
# At this point, Beautiful Soup has done what a browser does - decode
# any encoded attributes. So we need to re-encode the string, see if
# there are any ampersands and, if so, re-encode them again.
interim = escape(img['alt'])
if interim.find("&") != -1:
img['alt'] = escape(interim)
edited = True

if edited:
html = str(soup)
return html

def escape_encoded_pre_text(html: str) -> str:
edited = False
soup = BeautifulSoup(html, "html.parser")
spans = soup.find_all('span', class_="pre")
for span in spans:
# At this point, Beautiful Soup has done what a browser does - decode
# any encoded attributes. So we need to re-encode the string, see if
# there are any ampersands and, if so, re-encode them again.
interim = escape(span.string)
if interim.find("&") != -1:
span.string = escape(interim)
edited = True

if edited:
html = str(soup)
return html

def rewrite_hub_links(html: str, link_mappings: dict) -> str:
edited = False
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all('a')
for link in links:
for key in link_mappings:
if link['href'].startswith(key):
# We have a match, so replace the href with the new one
link['href'] = link['href'].replace(key, link_mappings[key])
# We also have to remove ".html" from the end of the link
link['href'] = link['href'].replace(".html", "")
edited = True

if edited:
html = str(soup)
return html
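A quick illustration of the navigation conversion, assuming the module is importable as added in this PR. The input string mimics the shape of Sphinx's local toctree HTML, written with each <a> as the first child of its <li> (no intervening whitespace), since the converter indexes contents[0]; the page names are invented.

from sphinxcontrib.serializinghtml import html_assists

nav = (
    '<ul>'
    '<li><a href="/overview/">Overview</a></li>'
    '<li><a href="/guides/">Guides</a>'
    '<ul><li><a href="/guides/setup/">Setup</a></li></ul>'
    '</li>'
    '</ul>'
)
print(html_assists.convert_nav_html_to_json(nav))
# Roughly:
# [{'type': 'link', 'text': 'Overview', 'href': 'overview'},
#  {'type': 'divider'},
#  {'type': 'expandable-link-group', 'text': 'Guides', 'href': 'guides',
#   'items': [{'type': 'link', 'text': 'Setup', 'href': 'guides/setup'}]}]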