From 7ea39ad0b039fdefb54440f83df6d26dd66afbe8 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Mon, 9 Jun 2025 18:50:31 +0900 Subject: [PATCH] test build on sphinx-markdown-builder --- docs-website/sphinx/Makefile | 6 +- docs-website/sphinx/apidocs/sdk.rst | 9 + docs-website/sphinx/conf.py | 2 + .../sphinx/convert_sphinx_to_docusaurus.py | 89 +++------ docs-website/sphinx/requirements.txt | 1 + docs-website/src/styles/sphinx.scss | 171 +++++++----------- 6 files changed, 106 insertions(+), 172 deletions(-) create mode 100644 docs-website/sphinx/apidocs/sdk.rst diff --git a/docs-website/sphinx/Makefile b/docs-website/sphinx/Makefile index e8c419f991..14083acee2 100644 --- a/docs-website/sphinx/Makefile +++ b/docs-website/sphinx/Makefile @@ -26,10 +26,10 @@ $(VENV_SENTINEL): requirements.txt # Not using Python's http.server because it enables caching headers. serve: - serve -p 3001 _build/html/ + serve -p 3001 _build/markdown/ -md: html - $(VENV_DIR)/bin/python3 convert_sphinx_to_docusaurus.py +md: + @$(SPHINXBUILD) -M markdown "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) && $(VENV_DIR)/bin/python3 convert_sphinx_to_docusaurus.py # Route other targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). diff --git a/docs-website/sphinx/apidocs/sdk.rst b/docs-website/sphinx/apidocs/sdk.rst new file mode 100644 index 0000000000..579be1be2b --- /dev/null +++ b/docs-website/sphinx/apidocs/sdk.rst @@ -0,0 +1,9 @@ +DataHub SDK +=========== + +The DataHub SDK is a Python library for interacting with the DataHub platform. + +.. automodule:: datahub.sdk.search_client + +.. automodule:: datahub.sdk.lineage_client + diff --git a/docs-website/sphinx/conf.py b/docs-website/sphinx/conf.py index 49cd20d5ef..276fdb1904 100644 --- a/docs-website/sphinx/conf.py +++ b/docs-website/sphinx/conf.py @@ -26,8 +26,10 @@ extensions = [ "sphinx_autodoc_typehints", # This enables us to autogenerate docs for our CLI. 
"sphinx_click", + "sphinx_markdown_builder", ] +markdown_anchor_sections = True napoleon_use_param = True # Move type hint info to function description instead of signature diff --git a/docs-website/sphinx/convert_sphinx_to_docusaurus.py b/docs-website/sphinx/convert_sphinx_to_docusaurus.py index 891e08d8c7..ef9e2c7cc6 100644 --- a/docs-website/sphinx/convert_sphinx_to_docusaurus.py +++ b/docs-website/sphinx/convert_sphinx_to_docusaurus.py @@ -1,78 +1,39 @@ import pathlib -import json -from bs4 import BeautifulSoup - SPHINX_ROOT_DIR = pathlib.Path(".") -SPHINX_BUILD_DIR = SPHINX_ROOT_DIR / pathlib.Path("_build/html/apidocs") +SPHINX_BUILD_DIR = SPHINX_ROOT_DIR / pathlib.Path("_build/markdown/apidocs") DOCS_OUTPUT_DIR = pathlib.Path("../docs/python-sdk") -def html_to_mdx(html: str) -> str: - # Because the HTML uses `class` and has `{}` in it, it isn't valid - # MDX. As such, we use React's dangerouslySetInnerHTML. - return f""" - -
- -""" - - -def bs4_to_mdx(soup: BeautifulSoup) -> str: - # TODO: Eventually we should do something smarter here to - # generate something that's closer to real Markdown. This would - # be helpful, for example, for enabling Docusaurus to generate - # a table of contents for the page. - return html_to_mdx(str(soup)) - - -def convert_html_to_md(html_file: pathlib.Path) -> str: - html = html_file.read_text() - soup = BeautifulSoup(html, "html.parser") - - body = soup.find("main").find("div", {"class": "bd-article-container"}) - article = body.find("article") - - # Remove all the "permalink to this heading" links. - for link in article.find_all("a", {"class": "headerlink"}): - link.decompose() - - # Remove the trailing " – " from arguments that are missing - # a description. - for item in article.select("dl.field-list dd p"): - # Note - that's U+2013, not a normal hyphen. - if str(item).endswith(" – "): - parent = item.parent - # print("orig item", item) - new_item = BeautifulSoup(str(item)[:-7] + "", "html.parser") - # print("new-item", str(new_item)) - parent.p.replace_with(new_item) - # print("item post replace", parent) - - # Extract title from the h1. - title_element = article.find("h1") - title = title_element.text - title_element.decompose() - - # TODO - generate nicer slugs for these pages - md_meta = f"""--- -title: {title} ----\n\n""" - - return md_meta + bs4_to_mdx(article) - - def main(): DOCS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - for doc in SPHINX_BUILD_DIR.glob("**/*.html"): - md = convert_html_to_md(doc) - - outfile = DOCS_OUTPUT_DIR / doc.relative_to(SPHINX_BUILD_DIR).with_suffix(".md") + for doc in SPHINX_BUILD_DIR.glob("**/*.md"): + outfile = DOCS_OUTPUT_DIR / doc.relative_to(SPHINX_BUILD_DIR) outfile.parent.mkdir(parents=True, exist_ok=True) - outfile.write_text(md) - print(f"Generated {outfile}") + with open(doc, "r") as f: + content = f.read() + + # Replace dangerous characters + replacements = [ + ("