diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js
index 4d9c5ba7ec..fe0068949d 100644
--- a/docs-website/sidebars.js
+++ b/docs-website/sidebars.js
@@ -746,11 +746,26 @@ module.exports = {
items: [
"metadata-ingestion/as-a-library",
{
- "Python SDK Reference": [
+ type: "category",
+ label: "SDK Reference",
+ items: [
{
- type: "autogenerated",
- dirName: "python-sdk",
+ type: "category",
+ label: "Builder",
+ items: [{ type: "autogenerated", dirName: "python-sdk/builder" }],
},
+ {
+ type: "category",
+ label: "Clients",
+ items: [{ type: "autogenerated", dirName: "python-sdk/clients" }],
+ },
+ {
+ type: "category",
+ label: "SDK V2",
+ items: [{ type: "autogenerated", dirName: "python-sdk/sdk-v2" }],
+ },
+ "python-sdk/models",
+ "python-sdk/urns",
],
},
],
diff --git a/docs-website/sphinx/Makefile b/docs-website/sphinx/Makefile
index e8c419f991..626c08026f 100644
--- a/docs-website/sphinx/Makefile
+++ b/docs-website/sphinx/Makefile
@@ -26,10 +26,10 @@ $(VENV_SENTINEL): requirements.txt
# Not using Python's http.server because it enables caching headers.
serve:
- serve -p 3001 _build/html/
+ serve -p 3001 _build/markdown/
-md: html
- $(VENV_DIR)/bin/python3 convert_sphinx_to_docusaurus.py
+md: venv
+ @$(SPHINXBUILD) -M markdown "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) && $(VENV_DIR)/bin/python3 convert_sphinx_to_docusaurus.py
# Route other targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
diff --git a/docs-website/sphinx/apidocs/builder.rst b/docs-website/sphinx/apidocs/builder.rst
deleted file mode 100644
index 820817264d..0000000000
--- a/docs-website/sphinx/apidocs/builder.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Builder
-=======
-
-These classes and methods make it easier to construct MetadataChangeProposals and MetadataChangeEvents.
-
-.. automodule:: datahub.emitter.mcp
-
-.. automodule:: datahub.emitter.mce_builder
-
-.. automodule:: datahub.emitter.mcp_builder
\ No newline at end of file
diff --git a/docs-website/sphinx/apidocs/builder/mce-builder.rst b/docs-website/sphinx/apidocs/builder/mce-builder.rst
new file mode 100644
index 0000000000..4c457557ac
--- /dev/null
+++ b/docs-website/sphinx/apidocs/builder/mce-builder.rst
@@ -0,0 +1,7 @@
+MCE Builder
+===========
+
+These classes and methods make it easier to construct MetadataChangeEvents.
+
+.. automodule:: datahub.emitter.mce_builder
+ :member-order: alphabetical
diff --git a/docs-website/sphinx/apidocs/builder/mcp-builder.rst b/docs-website/sphinx/apidocs/builder/mcp-builder.rst
new file mode 100644
index 0000000000..99690364d4
--- /dev/null
+++ b/docs-website/sphinx/apidocs/builder/mcp-builder.rst
@@ -0,0 +1,10 @@
+MCP Builder
+===========
+
+These classes and methods make it easier to construct MetadataChangeProposals.
+
+.. automodule:: datahub.emitter.mcp
+ :member-order: alphabetical
+
+.. automodule:: datahub.emitter.mcp_builder
+ :member-order: alphabetical
\ No newline at end of file
diff --git a/docs-website/sphinx/apidocs/clients.rst b/docs-website/sphinx/apidocs/clients.rst
deleted file mode 100644
index 0ceacaa68e..0000000000
--- a/docs-website/sphinx/apidocs/clients.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Client
-======
-
-The Kafka emitter or Rest emitter can be used to push metadata to DataHub.
-The DataHub graph client extends the Rest emitter with additional functionality.
-
-.. automodule:: datahub.emitter.rest_emitter
-
-.. automodule:: datahub.emitter.kafka_emitter
-
-.. automodule:: datahub.ingestion.graph.client
diff --git a/docs-website/sphinx/apidocs/clients/graph-client.rst b/docs-website/sphinx/apidocs/clients/graph-client.rst
new file mode 100644
index 0000000000..7f39aae6ed
--- /dev/null
+++ b/docs-website/sphinx/apidocs/clients/graph-client.rst
@@ -0,0 +1,8 @@
+Graph Client
+============
+
+The DataHub graph client extends the Rest emitter with additional functionality.
+
+.. automodule:: datahub.ingestion.graph.client
+ :member-order: alphabetical
+
diff --git a/docs-website/sphinx/apidocs/clients/kafka-emitter.rst b/docs-website/sphinx/apidocs/clients/kafka-emitter.rst
new file mode 100644
index 0000000000..d3467f6d44
--- /dev/null
+++ b/docs-website/sphinx/apidocs/clients/kafka-emitter.rst
@@ -0,0 +1,8 @@
+Kafka Emitter
+=============
+
+The Kafka emitter can be used to push metadata to DataHub.
+
+.. automodule:: datahub.emitter.kafka_emitter
+ :member-order: alphabetical
+
diff --git a/docs-website/sphinx/apidocs/clients/rest-emitter.rst b/docs-website/sphinx/apidocs/clients/rest-emitter.rst
new file mode 100644
index 0000000000..1ca4252cd4
--- /dev/null
+++ b/docs-website/sphinx/apidocs/clients/rest-emitter.rst
@@ -0,0 +1,8 @@
+Rest Emitter
+============
+
+The Rest emitter can be used to push metadata to DataHub.
+
+.. automodule:: datahub.emitter.rest_emitter
+ :member-order: alphabetical
+
diff --git a/docs-website/sphinx/apidocs/sdk-v2/entities.rst b/docs-website/sphinx/apidocs/sdk-v2/entities.rst
new file mode 100644
index 0000000000..4eb6ba0747
--- /dev/null
+++ b/docs-website/sphinx/apidocs/sdk-v2/entities.rst
@@ -0,0 +1,29 @@
+Entities
+========
+
+The DataHub SDK provides a set of entities that can be used to interact with DataHub's metadata.
+
+
+.. automodule:: datahub.sdk.dataset
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.container
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.mlmodel
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.mlmodelgroup
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.dashboard
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.chart
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.datajob
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.dataflow
+ :member-order: alphabetical
\ No newline at end of file
diff --git a/docs-website/sphinx/apidocs/sdk-v2/entity-client.rst b/docs-website/sphinx/apidocs/sdk-v2/entity-client.rst
new file mode 100644
index 0000000000..2db2166559
--- /dev/null
+++ b/docs-website/sphinx/apidocs/sdk-v2/entity-client.rst
@@ -0,0 +1,7 @@
+Entity Client
+=============
+
+The DataHub Entity Client provides a client for interacting with DataHub entities.
+
+.. automodule:: datahub.sdk.entity_client
+ :member-order: alphabetical
diff --git a/docs-website/sphinx/apidocs/sdk-v2/lineage-client.rst b/docs-website/sphinx/apidocs/sdk-v2/lineage-client.rst
new file mode 100644
index 0000000000..068343f09a
--- /dev/null
+++ b/docs-website/sphinx/apidocs/sdk-v2/lineage-client.rst
@@ -0,0 +1,7 @@
+Lineage Client
+==============
+
+The DataHub Lineage Client provides a client for searching and retrieving lineage metadata from DataHub.
+
+.. automodule:: datahub.sdk.lineage_client
+ :member-order: alphabetical
diff --git a/docs-website/sphinx/apidocs/sdk-v2/main-client.rst b/docs-website/sphinx/apidocs/sdk-v2/main-client.rst
new file mode 100644
index 0000000000..d3fff8deec
--- /dev/null
+++ b/docs-website/sphinx/apidocs/sdk-v2/main-client.rst
@@ -0,0 +1,7 @@
+Main Client
+===========
+
+The DataHub Main Client provides a client for interacting with DataHub.
+
+.. automodule:: datahub.sdk.main_client
+ :member-order: alphabetical
diff --git a/docs-website/sphinx/apidocs/sdk-v2/resolver-client.rst b/docs-website/sphinx/apidocs/sdk-v2/resolver-client.rst
new file mode 100644
index 0000000000..8940417773
--- /dev/null
+++ b/docs-website/sphinx/apidocs/sdk-v2/resolver-client.rst
@@ -0,0 +1,7 @@
+Resolver Client
+===============
+
+The DataHub Resolver Client provides a client for resolving entities by their URN.
+
+.. automodule:: datahub.sdk.resolver_client
+ :member-order: alphabetical
diff --git a/docs-website/sphinx/apidocs/sdk-v2/search-client.rst b/docs-website/sphinx/apidocs/sdk-v2/search-client.rst
new file mode 100644
index 0000000000..ecce9b8182
--- /dev/null
+++ b/docs-website/sphinx/apidocs/sdk-v2/search-client.rst
@@ -0,0 +1,10 @@
+Search Client
+=============
+
+The DataHub Search Client provides a client for searching and retrieving metadata from DataHub.
+
+.. automodule:: datahub.sdk.search_client
+ :member-order: alphabetical
+
+.. automodule:: datahub.sdk.search_filters
+ :member-order: alphabetical
\ No newline at end of file
diff --git a/docs-website/sphinx/cli.rst b/docs-website/sphinx/cli.rst
deleted file mode 100644
index e4b30d6c65..0000000000
--- a/docs-website/sphinx/cli.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-DataHub CLI
-===========
-
-.. click:: datahub.entrypoints:datahub
- :prog: datahub
- :nested: full
diff --git a/docs-website/sphinx/conf.py b/docs-website/sphinx/conf.py
index 49cd20d5ef..276fdb1904 100644
--- a/docs-website/sphinx/conf.py
+++ b/docs-website/sphinx/conf.py
@@ -26,8 +26,10 @@ extensions = [
"sphinx_autodoc_typehints",
# This enables us to autogenerate docs for our CLI.
"sphinx_click",
+ "sphinx_markdown_builder",
]
+markdown_anchor_sections = True
napoleon_use_param = True
# Move type hint info to function description instead of signature
diff --git a/docs-website/sphinx/convert_sphinx_to_docusaurus.py b/docs-website/sphinx/convert_sphinx_to_docusaurus.py
index 891e08d8c7..b4080dcaf2 100644
--- a/docs-website/sphinx/convert_sphinx_to_docusaurus.py
+++ b/docs-website/sphinx/convert_sphinx_to_docusaurus.py
@@ -1,78 +1,166 @@
import pathlib
-import json
-from bs4 import BeautifulSoup
-
+import re
SPHINX_ROOT_DIR = pathlib.Path(".")
-SPHINX_BUILD_DIR = SPHINX_ROOT_DIR / pathlib.Path("_build/html/apidocs")
+SPHINX_BUILD_DIR = SPHINX_ROOT_DIR / "_build/markdown"
DOCS_OUTPUT_DIR = pathlib.Path("../docs/python-sdk")
+HTML_TAGS = {
+ "html", "head", "title", "base", "link", "meta", "style", "script", "noscript",
+ "body", "section", "nav", "article", "aside", "h1", "h2", "h3", "h4", "h5", "h6",
+ "header", "footer", "address", "p", "hr", "pre", "blockquote", "ol", "ul", "li",
+ "dl", "dt", "dd", "figure", "figcaption", "div", "main", "a", "em", "strong",
+ "small", "s", "cite", "q", "dfn", "abbr", "data", "time", "code", "var", "samp",
+ "kbd", "sub", "sup", "i", "b", "u", "mark", "ruby", "rt", "rp", "bdi", "bdo",
+ "span", "br", "wbr", "ins", "del", "img", "iframe", "embed", "object", "param",
+ "video", "audio", "track", "canvas", "map", "area", "svg", "math",
+ "table", "caption", "colgroup", "col", "tbody", "thead", "tfoot", "tr", "td", "th",
+ "form", "fieldset", "legend", "label", "button", "select", "datalist",
+ "optgroup", "option", "textarea", "output", "progress", "meter", "details",
+ "summary", "dialog", "template", "slot", "portal"
+}
-def html_to_mdx(html: str) -> str:
- # Because the HTML uses `class` and has `{}` in it, it isn't valid
- # MDX. As such, we use React's dangerouslySetInnerHTML.
- return f"""
+REPLACEMENTS = [
+ ("\1
',
+ arg_str
+ )
-def bs4_to_mdx(soup: BeautifulSoup) -> str:
- # TODO: Eventually we should do something smarter here to
- # generate something that's closer to real Markdown. This would
- # be helpful, for example, for enabling Docusaurus to generate
- # a table of contents for the page.
- return html_to_mdx(str(soup))
+# ---- ARGUMENT PARSER ----
+def parse_args(arg_str: str) -> str:
+ if not arg_str.strip():
+ return ""
+ parts = []
+ for arg in arg_str.split(","):
+ arg = arg.strip().replace("\\", "")
+ if arg == "*":
+ parts.append("*")
+ continue
-def convert_html_to_md(html_file: pathlib.Path) -> str:
- html = html_file.read_text()
- soup = BeautifulSoup(html, "html.parser")
+ for pattern, template in [
+ (r"([\w_]+)\s*:\s*([^=]+)\s*=\s*(.+)", r'\1: \2 = \3'),
+ (r"([\w_]+)\s*=\s*(.+)", r'\1 = \2'),
+ (r"([\w_]+)\s*:\s*(.+)", r'\1: \2')
+ ]:
+ m = re.match(pattern, arg)
+ if m:
+ parts.append(m.expand(template))
+ break
+ else:
+ parts.append(f'{arg}')
- body = soup.find("main").find("div", {"class": "bd-article-container"})
- article = body.find("article")
+ parsed = ", ".join(parts)
+ parsed = convert_md_link_to_html(parsed)
+ return parsed
- # Remove all the "permalink to this heading" links.
- for link in article.find_all("a", {"class": "headerlink"}):
- link.decompose()
+# ---- HEADING PARSER ----
+def parse_heading(text: str):
+ match = re.match(r"(?:\*class\*\s+)?([\w\.]+)\.([\w]+)(?:\((.*)\))?", text)
+ if match:
+ owner, name, args = match.groups()
+ parsed_args = parse_args(args or "")
+ prefix = 'class ' if "*class*" in text else ""
+ heading = f'{prefix}{owner}.{name}'
+ heading += f"({parsed_args})" if parsed_args else "()"
+ slug = f"{owner}.{name}"
+ return name, heading, slug
- # Remove the trailing " – " from arguments that are missing
- # a description.
- for item in article.select("dl.field-list dd p"):
- # Note - that's U+2013, not a normal hyphen.
- if str(item).endswith(" –