Merge pull request #1085 from sphinx-contrib/introduce-scb-manifest

manifest: introduction of a manifest file
sphinx-contrib · Feb 1, 2025 · 2ee87fe · 2ee87fe
2 parents 8b6228d + d4da934
commit 2ee87fe
Show file tree

Hide file tree

Showing 4 changed files with 252 additions and 0 deletions.
diff --git a/doc/configuration.rst b/doc/configuration.rst
@@ -2095,6 +2095,22 @@ Advanced processing configuration
 
     .. versionadded:: 2.10
 
+.. confval:: confluence_manifest_data
+
+    A manifest file (``scb-manifest.json``) is generated in the output
+    directory after each run. The manifest lists the built pages as well
+    as the attachments for these pages. Each page/attachment entry provides
+    a path to where its content resides. If a user instead wishes to embed
+    the content itself in the manifest, this option can be used to
+    Base64-encode page/attachment data into the manifest. By default, this
+    is disabled:
+
+    .. code-block:: python
+
+        confluence_manifest_data = True
+
+    .. versionadded:: 2.10
+
 .. index:: Mentions; Configuration
 
 .. _confluence_mentions:

diff --git a/sphinxcontrib/confluencebuilder/__init__.py b/sphinxcontrib/confluencebuilder/__init__.py
@@ -244,6 +244,8 @@ def setup(app):
     cm.add_conf('confluence_link_suffix', 'confluence')
     # Enable raw math output for MathJax support
     cm.add_conf_bool('confluence_mathjax', 'confluence')
+    # Embed page/attachment data into the manifest
+    cm.add_conf_bool('confluence_manifest_data')
     # Mappings for documentation mentions to Confluence keys.
     cm.add_conf('confluence_mentions', 'confluence')
     # Inject navigational hints into the documentation.

diff --git a/sphinxcontrib/confluencebuilder/builder.py b/sphinxcontrib/confluencebuilder/builder.py
@@ -20,6 +20,7 @@
 from sphinxcontrib.confluencebuilder.env import ConfluenceCacheInfo
 from sphinxcontrib.confluencebuilder.intersphinx import build_intersphinx
 from sphinxcontrib.confluencebuilder.logger import ConfluenceLogger
+from sphinxcontrib.confluencebuilder.manifest import ConfluenceManifest
 from sphinxcontrib.confluencebuilder.nodes import confluence_footer
 from sphinxcontrib.confluencebuilder.nodes import confluence_header
 from sphinxcontrib.confluencebuilder.nodes import confluence_metadata
@@ -93,6 +94,8 @@ def __init__(self, app, env=None):
         self._original_get_doctree = None
         self._verbose = self.app.verbosity
 
+        self.manifest = ConfluenceManifest(self.config, self.state)
+
         # state tracking is set at initialization (not cleanup) so its content's
         # can be checked/validated on after the builder has executed (testing)
         self.state.reset()
@@ -103,6 +106,10 @@ def init(self):
         apply_defaults(self)
         config = self.config
 
+        # populate desired metadata into the manifest after the configuration
+        # has been finalized
+        self.manifest.register_metadata()
+
         self.add_secnumbers = self.config.confluence_add_secnumbers
         self.secnumber_suffix = self.config.confluence_secnumber_suffix
         self.post_cleanup = config.confluence_cleanup_purge or \
@@ -476,6 +483,9 @@ def write_doc(self, docname, doctree):
                         file.write(self.writer.output)
             except OSError as err:
                 self.warn(f'error writing file {out_file}: {err}')
+            else:
+                self.manifest.add_page(
+                    docname, self.writer.output, out_file, self.out_dir)
 
         self._cache_info.track_page_hash(docname)
 
@@ -866,6 +876,20 @@ def to_asset_name(asset):
 
             self.publish_cleanup()
             self.publish_finalize()
+        else:
+            assets = self.assets.build()
+
+        # track all referenced assets into the manifest
+        for asset in assets:
+            key, abs_file, mime, hash_, docname = asset
+            self.manifest.add_attachment(
+                docname, key, mime, hash_, Path(abs_file), self.out_dir)
+
+        # output the manifest into the output directory
+        self.info('building manifest...', nonl=(not self._verbose))
+        self.manifest.export(self.out_dir)
+        if not self._verbose:
+            self.info(' done')
 
         # persist cache from this run
         self._cache_info.save_cache()

diff --git a/sphinxcontrib/confluencebuilder/manifest.py b/sphinxcontrib/confluencebuilder/manifest.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: BSD-2-Clause
+# Copyright Sphinx Confluence Builder Contributors (AUTHORS)
+
+from base64 import b64encode
+from datetime import datetime
+from datetime import timezone
+from docutils import __version__ as docutils_version
+from pathlib import Path
+from sphinx import __version__ as sphinx_version
+from sphinx.config import Config
+from sphinxcontrib.confluencebuilder.state import ConfluenceState
+from sphinxcontrib.confluencebuilder.util import ConfluenceUtil
+from typing import Any
+import json
+import os
+
+
+class ConfluenceManifest:
+    def __init__(self, config: Config, state: ConfluenceState):
+        """
+        a confluence manifest
+
+        A manifest is generated after a build. It can be used to inform a
+        user or other tooling what pages/attachments have been processed,
+        along with what page titles and detected hierarchy are expected
+        (if any).
+
+        While this can be used for informational purposes, this information
+        can also be used by third-party tooling to take generated Confluence
+        information and perform publishing in their own manner (e.g. users
+        with an air-gapped environment or needing some sort of publish
+        separation due to authentication considerations). Note that while
+        this extension can generate a manifest, there is no tooling provided
+        to use the manifest in a way to publish.
+
+        Args:
+            config: the active configuration
+            state: this extension's runtime state tracking
+        """
+        self.config = config
+        self.state = state
+
+        # the manifest content that is serialized to JSON on export; the
+        # `pages` and `attachments` lists are created on first use
+        self.data: dict[str, Any] = {
+            'type': 'SphinxConfluenceBuilder/Manifest',
+            'spec': 1,
+        }
+
+    def register_metadata(self) -> None:
+        """
+        register metadata into the tracked manifest
+
+        When invoked, this call will populate various metadata into the
+        manifest cache from the resolved configuration (e.g. project version).
+        Only options holding a value are added; the placeholder project and
+        author names checked below are treated as unset.
+        """
+
+        cfg = self.config
+
+        if cfg.project and cfg.project != 'Project name not set':
+            self.data['project'] = cfg.project
+
+        if cfg.release:
+            self.data['release'] = cfg.release
+
+        if cfg.version:
+            self.data['version'] = cfg.version
+
+        if cfg.author and cfg.author != 'Author name not set':
+            self.data['author'] = cfg.author
+
+        if cfg.copyright:
+            self.data['copyright'] = cfg.copyright
+
+        if cfg.language:
+            self.data['language'] = cfg.language
+
+        # flag that page/attachment entries carry Base64-encoded content
+        if cfg.confluence_manifest_data:
+            self.data['includesData'] = True
+
+    def add_page(self, docname: str, output: str,
+            out_file: Path, out_dir: Path) -> None:
+        """
+        add a page into the manifest
+
+        For any page that is built, this call is used to track it into the
+        manifest cache. This includes using the docname as a page identifier
+        and includes information such as the expected title for a page.
+
+        Args:
+            docname: the docname
+            output: the raw output for a page
+            out_file: the path of the built page file
+            out_dir: the base folder for any output data
+        """
+
+        title = self.state.title(docname)
+
+        entry: dict[str, Any] = {
+            'id': docname,
+            'title': title,
+        }
+
+        is_root_doc = self.config.root_doc == docname
+        if is_root_doc:
+            entry['isRoot'] = True
+
+        parent_docname = self.state.parent_docname(docname)
+        if parent_docname:
+            parent_title = self.state.title(parent_docname)
+
+            entry['parentId'] = parent_docname
+            entry['parentTitle'] = parent_title
+
+        entry.update({
+            'hash': {
+                # Note that this hash will be of the contents with LF
+                # line endings. For output generated on Windows, the
+                # hash here will not exactly match the hash of the file.
+                # This is fine as this hash is mainly to help identify
+                # the uniqueness of the content.
+                'sha256': ConfluenceUtil.hash(output),
+            },
+            'path': self._resolve_path(out_file, out_dir),
+        })
+
+        if self.config.confluence_manifest_data:
+            entry['data'] = b64encode(output.encode('utf-8')).decode()
+
+        pages = self.data.setdefault('pages', [])
+        pages.append(entry)
+
+    def add_attachment(self, docname: str, key: str, mime: str, hash_: str,
+            path: Path, out_dir: Path) -> None:
+        """
+        add an attachment into the manifest
+
+        For any attachment that is processed, this call is used to track it
+        into the manifest cache. This includes using the expected attachment
+        name, the page that should hold the attachment and more.
+
+        Args:
+            docname: the docname that should hold this attachment
+            key: the identifier to use for an attachment on publish
+            mime: the media type of the attachment
+            hash_: the hash of the attachment
+            path: the path of the attachment file; it is opened directly
+                   when attachment data is embedded into the manifest
+            out_dir: the base folder for any output data
+        """
+
+        title = self.state.title(docname)
+
+        entry: dict[str, Any] = {
+            'id': key,
+            'pageId': docname,
+            'pageTitle': title,
+            'mimeType': mime,
+            'hash': {
+                'sha256': hash_,
+            },
+            'path': self._resolve_path(path, out_dir),
+        }
+
+        if self.config.confluence_manifest_data:
+            with path.open('rb') as fp:
+                entry['data'] = b64encode(fp.read()).decode()
+
+        attachments = self.data.setdefault('attachments', [])
+        attachments.append(entry)
+
+    def export(self, out_dir: Path) -> None:
+        """
+        export the manifest content
+
+        When an export is requested, the contents will be published into
+        a `scb-manifest.json` file into the project's output directory.
+        Version information and a UTC generation timestamp are added to
+        the manifest before it is written.
+
+        Args:
+            out_dir: the folder to output the manifest into
+        """
+
+        # imported here rather than at module level (presumably to avoid
+        # a circular import with the package's `__init__`)
+        from sphinxcontrib.confluencebuilder import __version__ as scb_version
+        self.data.update({
+            'confluencebuilderVersion': scb_version,
+            'sphinxVersion': sphinx_version,
+            'docutilsVersion': docutils_version,
+            'generated': datetime.now(timezone.utc).isoformat(),
+        })
+
+        # use an explicit encoding so the written file does not depend on
+        # the platform's locale default
+        manifest_path = out_dir / 'scb-manifest.json'
+        with manifest_path.open('w', encoding='utf-8') as fp:
+            json.dump(self.data, fp, indent=4)
+
+    def _resolve_path(self, path: Path, base: Path) -> str:
+        """
+        resolve a page/attachment path based off a base path
+
+        We attempt to provide a path in the manifest if tooling wishes to
+        reference/use a given page/attachment file. The path will be relative
+        to the output directory whenever such a path can be formed.
+
+        Note that it is possible for an attachment to exist outside of the
+        output directory. On Windows, an attachment may even reside on a
+        different drive than the output directory, in which case no relative
+        path exists and the absolute path is provided instead.
+
+        Args:
+            path: the path of the file
+            base: the output directory to be relative to
+
+        Returns:
+            the relative path (or an absolute path if a relative path cannot
+            be determined), using forward slashes as separators
+        """
+
+        try:
+            resolved = Path(os.path.relpath(path, base))
+        except ValueError:
+            # `relpath` raises a `ValueError` on Windows when both paths
+            # are on different drives; do not fail the build over this
+            resolved = Path(os.path.abspath(path))
+
+        return resolved.as_posix()