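"""Convert the Sphinx-generated HTML API docs into MDX pages.

A summary inferred from the code below: this walks every HTML page under
SPHINX_BUILD_DIR and writes a matching .md (MDX) file under
DOCS_OUTPUT_DIR, mirroring the directory structure. SPHINX_ROOT_DIR is
".", so it assumes it is run from the Sphinx root directory.
"""
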
import pathlib
import json

from bs4 import BeautifulSoup

SPHINX_ROOT_DIR = pathlib.Path(".")
SPHINX_BUILD_DIR = SPHINX_ROOT_DIR / pathlib.Path("_build/html/apidocs")
DOCS_OUTPUT_DIR = pathlib.Path("../docs/python-sdk")


def html_to_mdx(html: str) -> str:
    # Because the HTML uses `class` and has `{}` in it, it isn't valid
    # MDX. As such, we use React's dangerouslySetInnerHTML.
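    #
    # Illustrative example (hypothetical input, not taken from the real
    # docs): html_to_mdx('<p class="x">hi</p>') returns roughly
    #   <div dangerouslySetInnerHTML={{__html: "<p class=\"x\">hi</p>"}}></div>
    # since json.dumps produces a valid JS string literal, and the
    # quadrupled braces below render as literal {{ and }} in the output.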
    return f"""
<div dangerouslySetInnerHTML={{{{__html: {json.dumps(html)}}}}}></div>
"""


def bs4_to_mdx(soup: BeautifulSoup) -> str:
    # TODO: Eventually we should do something smarter here to
    # generate something that's closer to real Markdown. This would
    # be helpful, for example, for enabling Docusaurus to generate
    # a table of contents for the page.
    return html_to_mdx(str(soup))


def convert_html_to_md(html_file: pathlib.Path) -> str:
    html = html_file.read_text()
    soup = BeautifulSoup(html, "html.parser")

    body = soup.find("main").find("div", {"class": "bd-article-container"})
    article = body.find("article")
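
    # Note: "bd-article-container" appears to come from the pydata Sphinx
    # theme's page layout; if the docs build switches themes, this lookup
    # (and the "headerlink" cleanup below) will likely need updating.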

    # Remove all the "permalink to this heading" links.
    for link in article.find_all("a", {"class": "headerlink"}):
        link.decompose()

    # Remove the trailing " – " from arguments that are missing
    # a description.
    for item in article.select("dl.field-list dd p"):
        # Note - that's U+2013, not a normal hyphen.
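        # Illustrative example (hypothetical markup): this rewrite turns
        #   <p><em>timeout</em> – </p>  into  <p><em>timeout</em></p>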
        if str(item).endswith(" – </p>"):
            parent = item.parent
            # print("orig item", item)
            # " – </p>" is 7 characters, so [:-7] drops the separator
            # and we restore the closing tag.
            new_item = BeautifulSoup(str(item)[:-7] + "</p>", "html.parser")
            # print("new-item", str(new_item))
            parent.p.replace_with(new_item)
            # print("item post replace", parent)

    # Extract title from the h1.
    title_element = article.find("h1")
    title = title_element.text
    title_element.decompose()

    # TODO - generate nicer slugs for these pages
    md_meta = f"""---
title: {title}
---\n\n"""
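    # e.g. for a page titled "Client" (hypothetical title) this yields:
    #   ---
    #   title: Client
    #   ---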

    return md_meta + bs4_to_mdx(article)


def main():
    DOCS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for doc in SPHINX_BUILD_DIR.glob("**/*.html"):
        md = convert_html_to_md(doc)
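
        # Mirror the Sphinx build tree under the output directory,
        # swapping the .html suffix for .md.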
        outfile = DOCS_OUTPUT_DIR / doc.relative_to(SPHINX_BUILD_DIR).with_suffix(".md")
        outfile.parent.mkdir(parents=True, exist_ok=True)
        outfile.write_text(md)

        print(f"Generated {outfile}")


if __name__ == "__main__":
    main()