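"""Generate markdown documentation for DataHub ingestion sources.

Walks the ingestion source registry, loads each plugin's config class, and
renders per-platform docs (config tables, capability matrices, starter
recipes) into the directory given by --out-dir.
"""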
import glob
import json
import logging
import os
import pathlib
import re
import sys
import textwrap
from importlib.metadata import metadata, requires
from typing import Any, Dict, List, Optional, Type

import click
from pydantic import Field
from pydantic.dataclasses import dataclass

from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.decorators import (
    CapabilitySetting,
    SourceCapability,
    SupportStatus,
)
from datahub.ingestion.source.source_registry import source_registry

logger = logging.getLogger(__name__)

@dataclass
class FieldRow:
    path: str
    parent: Optional[str]
    type_name: str
    required: bool
    default: str
    description: str
    inner_fields: List["FieldRow"] = Field(default_factory=list)

    def get_checkbox(self) -> str:
        if self.required:
            if not self.parent:  # None and empty string both count as top-level
                return "✅"
            else:
                return f"❓ (required if {self.parent} is set)"
        else:
            return ""

    def to_md_line(self) -> str:
        return (
            f"| {self.path} | {self.get_checkbox()} | {self.type_name} | {self.description} | {self.default} |\n"
            + "".join([inner_field.to_md_line() for inner_field in self.inner_fields])
        )

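# Illustrative sketch of how FieldRow renders (values below are hypothetical):
#
#     row = FieldRow(
#         path="profiling.enabled",
#         parent="profiling",
#         type_name="boolean",
#         required=False,
#         default="False",
#         description="Whether profiling is enabled.",
#     )
#     row.to_md_line()
#     # -> "| profiling.enabled |  | boolean | Whether profiling is enabled. | False |\n"
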
class FieldHeader(FieldRow):
    def to_md_line(self) -> str:
        return "\n".join(
            [
                "| Field | Required | Type | Description | Default |",
                "| --- | --- | --- | --- | --- |",
                "",
            ]
        )

    def __init__(self):
        # Deliberately skip FieldRow's generated __init__: the header row carries no data.
        pass

def get_definition_dict_from_definition(
    definitions_dict: Dict[str, Any], definition_name: str
) -> Dict[str, Any]:
    m = re.search(r"#/definitions/(.*)$", definition_name)
    if m:
        definition_term: str = m.group(1)
        return definitions_dict[definition_term]

    raise Exception("Failed to find a definition for " + definition_name)

def get_prefixed_name(field_prefix: Optional[str], field_name: Optional[str]) -> str:
    assert (
        field_prefix or field_name
    ), "One of field_prefix or field_name should be present"
    if field_prefix and field_name:
        return f"{field_prefix}.{field_name}"
    # Exactly one of the two is set at this point.
    return field_prefix or field_name  # type: ignore

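# Illustrative behavior of get_prefixed_name:
#     get_prefixed_name("source.config", "token") -> "source.config.token"
#     get_prefixed_name(None, "token")            -> "token"
#     get_prefixed_name("source.config", None)    -> "source.config"
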
def gen_md_table_from_struct(schema_dict: Dict[str, Any]) -> List[str]:
    field_rows: List[FieldRow] = []
    gen_md_table(schema_dict, schema_dict.get("definitions", {}), md_str=field_rows)

    # Render simple fields first, then fields that expand into nested rows.
    field_rows = [row for row in field_rows if len(row.inner_fields) == 0] + [
        row for row in field_rows if len(row.inner_fields) > 0
    ]

    return (
        [FieldHeader().to_md_line()]
        + [row.to_md_line() for row in field_rows]
        + ["\n"]
    )

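# Illustrative sketch (hypothetical config class): any pydantic model's JSON
# schema can be fed through the table generator.
#
#     class MyConfig(ConfigModel):
#         host: str
#         port: int = 5432
#
#     lines = gen_md_table_from_struct(MyConfig.schema())
#     markdown = "".join(lines)  # header row + one row per field
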
def get_enum_description(
    authored_description: Optional[str], enum_symbols: List[str]
) -> str:
    description = authored_description or ""
    missed_symbols = [symbol for symbol in enum_symbols if symbol not in description]
    if missed_symbols:
        # Append the allowed symbols if the authored description doesn't already mention them.
        suffix = "Allowed symbols are " + ", ".join(enum_symbols)
        description = f"{description}. {suffix}" if description else suffix

    return description

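# Illustrative behavior of get_enum_description:
#     get_enum_description(None, ["ENABLED", "DISABLED"])
#     # -> "Allowed symbols are ENABLED, DISABLED"
#     get_enum_description("Removal policy", ["ENABLED", "DISABLED"])
#     # -> "Removal policy. Allowed symbols are ENABLED, DISABLED"
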
def gen_md_table(
    field_dict: Dict[str, Any],
    definitions_dict: Dict[str, Any],
    md_str: List[FieldRow],
    field_prefix: Optional[str] = None,
) -> None:
    if "enum" in field_dict:
        md_str.append(
            FieldRow(
                path=get_prefixed_name(field_prefix, None),
                parent=field_prefix,
                type_name="Enum",
                required=field_dict.get("required") or False,
                description=f"one of {','.join(field_dict['enum'])}",
                default=str(field_dict.get("default", "None")),
            )
        )

    elif "properties" in field_dict:
        for field_name, value in field_dict["properties"].items():
            required_field: bool = field_name in field_dict.get("required", [])

            if "allOf" in value:
                for sub_schema in value["allOf"]:
                    reference = sub_schema["$ref"]
                    def_dict = get_definition_dict_from_definition(
                        definitions_dict, reference
                    )
                    # special case for enum reference, we don't split up the rows
                    if "enum" in def_dict:
                        row = FieldRow(
                            path=get_prefixed_name(field_prefix, field_name),
                            parent=field_prefix,
                            type_name=f"enum({reference.split('/')[-1]})",
                            description=get_enum_description(
                                value.get("description"), def_dict["enum"]
                            ),
                            default=str(value.get("default", "")),
                            required=required_field,
                        )
                        md_str.append(row)
                    else:
                        # object reference: emit a parent row, then recurse into its fields
                        row = FieldRow(
                            path=get_prefixed_name(field_prefix, field_name),
                            parent=field_prefix,
                            type_name=f"{reference.split('/')[-1]} (see below for fields)",
                            description=value.get("description") or "",
                            default=str(value.get("default", "")),
                            required=required_field,
                        )
                        md_str.append(row)
                        gen_md_table(
                            def_dict,
                            definitions_dict,
                            field_prefix=get_prefixed_name(field_prefix, field_name),
                            md_str=row.inner_fields,
                        )
            elif "type" in value and value["type"] == "enum":
                # enum declared via an allOf reference
                enum_definition = value["allOf"][0]["$ref"]
                def_dict = get_definition_dict_from_definition(
                    definitions_dict, enum_definition
                )
                logger.debug(f"Enum value: {value}, definition: {def_dict}")
                md_str.append(
                    FieldRow(
                        path=get_prefixed_name(field_prefix, field_name),
                        parent=field_prefix,
                        type_name="Enum",
                        description=f"one of {','.join(def_dict['enum'])}",
                        required=required_field,
                        default=str(value.get("default", "None")),
                    )
                )

            elif "type" in value and value["type"] == "object":
                # struct
                if "$ref" not in value:
                    if (
                        "additionalProperties" in value
                        and "$ref" in value["additionalProperties"]
                    ):
                        # map of string -> referenced object
                        value_ref = value["additionalProperties"]["$ref"]
                        def_dict = get_definition_dict_from_definition(
                            definitions_dict, value_ref
                        )

                        row = FieldRow(
                            path=get_prefixed_name(field_prefix, field_name),
                            parent=field_prefix,
                            type_name=f"Dict[str, {value_ref.split('/')[-1]}]",
                            description=value.get("description") or "",
                            default=str(value.get("default", "")),
                            required=required_field,
                        )
                        md_str.append(row)
                        gen_md_table(
                            def_dict,
                            definitions_dict,
                            field_prefix=get_prefixed_name(
                                field_prefix, f"{field_name}.`key`"
                            ),
                            md_str=row.inner_fields,
                        )
                    else:
                        value_type = value.get("additionalProperties", {}).get("type")
                        md_str.append(
                            FieldRow(
                                path=get_prefixed_name(field_prefix, field_name),
                                parent=field_prefix,
                                type_name=(
                                    f"Dict[str,{value_type}]" if value_type else "Dict"
                                ),
                                description=value.get("description") or "",
                                default=str(value.get("default", "")),
                                required=required_field,
                            )
                        )
                else:
                    object_definition = value["$ref"]
                    row = FieldRow(
                        path=get_prefixed_name(field_prefix, field_name),
                        parent=field_prefix,
                        type_name=f"{object_definition.split('/')[-1]} (see below for fields)",
                        description=value.get("description") or "",
                        default=str(value.get("default", "")),
                        required=required_field,
                    )

                    md_str.append(row)
                    def_dict = get_definition_dict_from_definition(
                        definitions_dict, object_definition
                    )
                    gen_md_table(
                        def_dict,
                        definitions_dict,
                        field_prefix=get_prefixed_name(field_prefix, field_name),
                        md_str=row.inner_fields,
                    )
            elif "type" in value and value["type"] == "array":
                # array
                items_type = value["items"].get("type", "object")
                md_str.append(
                    FieldRow(
                        path=get_prefixed_name(field_prefix, field_name),
                        parent=field_prefix,
                        type_name=f"Array of {items_type}",
                        description=value.get("description") or "",
                        default=str(value.get("default", "None")),
                        required=required_field,
                    )
                )
                # TODO: Array of structs
            elif "type" in value:
                md_str.append(
                    FieldRow(
                        path=get_prefixed_name(field_prefix, field_name),
                        parent=field_prefix,
                        type_name=value["type"],
                        description=value.get("description") or "",
                        default=str(value.get("default", "None")),
                        required=required_field,
                    )
                )
            elif "$ref" in value:
                object_definition = value["$ref"]
                def_dict = get_definition_dict_from_definition(
                    definitions_dict, object_definition
                )
                row = FieldRow(
                    path=get_prefixed_name(field_prefix, field_name),
                    parent=field_prefix,
                    type_name=f"{object_definition.split('/')[-1]} (see below for fields)",
                    description=value.get("description") or "",
                    default=str(value.get("default", "")),
                    required=required_field,
                )

                md_str.append(row)
                gen_md_table(
                    def_dict,
                    definitions_dict,
                    field_prefix=get_prefixed_name(field_prefix, field_name),
                    md_str=row.inner_fields,
                )
            else:
                # fallback: untyped value, render as a generic dict
                md_str.append(
                    FieldRow(
                        path=get_prefixed_name(field_prefix, field_name),
                        parent=field_prefix,
                        type_name="Generic dict",
                        description=value.get("description", ""),
                        default=str(value.get("default", "None")),
                        required=required_field,
                    )
                )

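# Illustrative walk (hypothetical schema): a plain typed property becomes a
# single FieldRow; $ref/object properties additionally get their children
# appended to the parent row's inner_fields via recursion.
#
#     schema = {
#         "properties": {
#             "env": {"type": "string", "description": "Environment", "default": "PROD"}
#         },
#         "required": ["env"],
#     }
#     rows: List[FieldRow] = []
#     gen_md_table(schema, schema.get("definitions", {}), md_str=rows)
#     # rows[0].path == "env"; rows[0].required is True; rows[0].type_name == "string"
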
def get_snippet(long_string: str, max_length: int = 100) -> str:
    if len(long_string) > max_length:
        snippet = long_string[:max_length].strip() + "... "
    else:
        snippet = long_string.strip()

    # Collapse newlines and guarantee a single trailing space.
    snippet = snippet.replace("\n", " ")
    snippet = snippet.strip() + " "
    return snippet

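# Illustrative: get_snippet("a" * 150) keeps the first 100 characters, appends
# "... ", collapses newlines to spaces, and guarantees one trailing space.
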
def get_support_status_badge(support_status: SupportStatus) -> str:
    if support_status == SupportStatus.CERTIFIED:
        return "![Certified](https://img.shields.io/badge/support%20status-certified-brightgreen)"
    if support_status == SupportStatus.INCUBATING:
        return "![Incubating](https://img.shields.io/badge/support%20status-incubating-blue)"
    if support_status == SupportStatus.TESTING:
        return "![Testing](https://img.shields.io/badge/support%20status-testing-lightgrey)"

    return ""

def get_capability_supported_badge(supported: bool) -> str:
    return "✅" if supported else "❌"

def get_capability_text(src_capability: SourceCapability) -> str:
    """
    Returns markdown-formatted cell text for a capability, hyperlinked to the
    capability's feature page if one is known.
    """
    capability_docs_mapping: Dict[SourceCapability, str] = {
        SourceCapability.DELETION_DETECTION: "../../../../metadata-ingestion/docs/dev_guides/stateful.md#stale-entity-removal",
        SourceCapability.DOMAINS: "../../../domains.md",
        SourceCapability.PLATFORM_INSTANCE: "../../../platform-instances.md",
        SourceCapability.DATA_PROFILING: "../../../../metadata-ingestion/docs/dev_guides/sql_profiles.md",
    }

    capability_doc = capability_docs_mapping.get(src_capability)
    return (
        src_capability.value
        if not capability_doc
        else f"[{src_capability.value}]({capability_doc})"
    )

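# Illustrative: a capability with a known doc page renders as a link,
#     get_capability_text(SourceCapability.DOMAINS)
#     # -> f"[{SourceCapability.DOMAINS.value}](../../../domains.md)"
# while unmapped capabilities render as plain text.
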
def create_or_update(
    something: Dict[Any, Any], path: List[str], value: Any
) -> Dict[Any, Any]:
    """Set `value` at the nested `path` inside `something`, creating intermediate dicts as needed."""
    dict_under_operation = something
    for p in path[:-1]:
        if p not in dict_under_operation:
            dict_under_operation[p] = {}
        dict_under_operation = dict_under_operation[p]

    dict_under_operation[path[-1]] = value
    return something

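# Illustrative sketch of create_or_update (values are hypothetical):
#
#     docs: Dict[Any, Any] = {}
#     create_or_update(docs, ["mysql", "plugins", "mysql", "recipe"], "source: ...")
#     # docs == {"mysql": {"plugins": {"mysql": {"recipe": "source: ..."}}}}
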
def does_extra_exist(extra_name: str) -> bool:
    # Package metadata may contain multiple "Provides-Extra" headers, one per extra.
    for key, value in metadata("acryl-datahub").items():
        if key == "Provides-Extra" and value == extra_name:
            return True
    return False

def get_additional_deps_for_extra(extra_name: str) -> List[str]:
    all_requirements = requires("acryl-datahub") or []
    # Base dependencies are the ones with no "extra ==" environment marker.
    base_deps = {x.split(";")[0] for x in all_requirements if "extra ==" not in x}
    # Dependencies pulled in by this specific extra.
    extra_deps = {
        x.split(";")[0] for x in all_requirements if f"extra == '{extra_name}'" in x
    }
    # The additional deps are whatever the extra adds on top of the base install.
    return list(extra_deps - base_deps)

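# Illustrative requirement strings this parses (package names hypothetical):
#     "click>=7.1.2"                      -> base dependency (no marker)
#     "pymysql>=1.0.2; extra == 'mysql'"  -> added by the "mysql" extra
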
def relocate_path(orig_path: str, relative_path: str, relocated_path: str) -> str:
    """Recompute a relative link so it still resolves after the referencing doc moves."""
    new_path = os.path.join(os.path.dirname(orig_path), relative_path)
    assert os.path.exists(new_path), f"Relocated target does not exist: {new_path}"

    return os.path.relpath(new_path, os.path.dirname(relocated_path))

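# Illustrative (hypothetical paths): a doc at docs/sources/mysql/README.md that
# references ./img/logo.png, once relocated to docs/generated/ingestion/sources/mysql.md,
# needs that link rewritten relative to the new location. Note that relocate_path
# asserts the referenced file actually exists on disk.
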
def rewrite_markdown(file_contents: str, path: str, relocated_path: str) -> str:
    def new_url(original_url: str, file_path: str) -> str:
        if original_url.startswith(("http://", "https://", "#")):
            return original_url

        file_ext = pathlib.Path(original_url).suffix
        if file_ext.startswith(".md"):
            return original_url
        elif file_ext in [".png", ".svg", ".gif", ".pdf"]:
            # Images move with the generated doc, so their relative paths must be rewritten.
            return relocate_path(path, original_url, relocated_path)
        return original_url

    # Look for the [text](url) syntax. Note that this will also capture images.
    #
    # We do a little bit of parenthesis matching here to account for parens in URLs.
    # See https://stackoverflow.com/a/17759264 for explanation of the second capture group.
    new_content = re.sub(
        r"\[(.*?)\]\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)",
        lambda x: f"[{x.group(1)}]({new_url(x.group(2).strip(), path)})",  # type: ignore
        file_contents,
    )

    # Also look for the [text]: url reference-link syntax, one definition per line.
    new_content = re.sub(
        r"^\[(.+?)\]\s*:\s*(.+?)\s*$",
        lambda x: f"[{x.group(1)}]: {new_url(x.group(2), path)}",
        new_content,
        flags=re.MULTILINE,
    )
    return new_content

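# Illustrative sketch of rewrite_markdown (paths hypothetical; the referenced
# image must exist on disk, since relocate_path asserts on it):
#
#     md = "See ![diagram](./img/arch.png) and [the site](https://example.com)."
#     rewrite_markdown(md, "docs/sources/mysql/README.md",
#                      "../docs/generated/ingestion/sources/mysql.md")
#     # The image link is re-based relative to the generated file;
#     # absolute URLs and anchors are left untouched.
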
@click.command()
@click.option("--out-dir", type=str, required=True)
@click.option("--extra-docs", type=str, required=False)
@click.option("--source", type=str, required=False)
def generate(
    out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None
) -> None:  # noqa: C901
    source_documentation: Dict[str, Any] = {}
    metrics: Dict[str, Any] = {}
    metrics["source_platforms"] = {"discovered": 0, "generated": 0, "warnings": []}
    metrics["plugins"] = {"discovered": 0, "generated": 0, "failed": 0}

    if extra_docs:
        # Pick up hand-written markdown docs and recipe yamls under the extra-docs tree.
        for path in glob.glob(f"{extra_docs}/**/*", recursive=True):
            if not path.endswith((".md", ".yaml", ".yml")):
                continue

            m = re.search(r"/docs/sources/(.*)/(.*)\.md", path)
            if m:
                platform_name = m.group(1).lower()
                file_name = m.group(2)
                destination_md: str = (
                    f"../docs/generated/ingestion/sources/{platform_name}.md"
                )

                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                final_markdown = rewrite_markdown(file_contents, path, destination_md)

                if file_name == "README":
                    # README becomes the platform-level docs;
                    # all other docs are assumed to be plugin level.
                    create_or_update(
                        source_documentation,
                        [platform_name, "custom_docs"],
                        final_markdown,
                    )
                elif "_" in file_name:
                    plugin_doc_parts = file_name.split("_")
                    if len(plugin_doc_parts) != 2 or plugin_doc_parts[1] not in [
                        "pre",
                        "post",
                    ]:
                        raise Exception(
                            f"{file_name} needs to be of the form <plugin>_pre.md or <plugin>_post.md"
                        )

                    docs_key_name = f"custom_docs_{plugin_doc_parts[1]}"
                    create_or_update(
                        source_documentation,
                        [
                            platform_name,
                            "plugins",
                            plugin_doc_parts[0],
                            docs_key_name,
                        ],
                        final_markdown,
                    )
                else:
                    create_or_update(
                        source_documentation,
                        [
                            platform_name,
                            "plugins",
                            file_name,
                            "custom_docs_post",
                        ],
                        final_markdown,
                    )
            else:
                yml_match = re.search(r"/docs/sources/(.*)/(.*)_recipe\.yml", path)
                if yml_match:
                    platform_name = yml_match.group(1).lower()
                    plugin_name = yml_match.group(2)
                    with open(path, "r") as doc_file:
                        file_contents = doc_file.read()
                    create_or_update(
                        source_documentation,
                        [platform_name, "plugins", plugin_name, "recipe"],
                        file_contents,
                    )

    for plugin_name in sorted(source_registry.mapping.keys()):
        if source and source != plugin_name:
            continue

        metrics["plugins"]["discovered"] += 1
        # We want to attempt to load all plugins before printing a summary.
        source_type = None
        try:
            class_or_exception = source_registry._ensure_not_lazy(plugin_name)
            if isinstance(class_or_exception, Exception):
                raise class_or_exception
            logger.debug(f"Processing {plugin_name}")
            source_type = source_registry.get(plugin_name)
            logger.debug(f"Source class is {source_type}")
            extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
            extra_deps = (
                get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
            )
        except Exception as e:
            logger.warning(
                f"Failed to process {plugin_name} due to exception {e}", exc_info=e
            )
            metrics["plugins"]["failed"] = metrics["plugins"].get("failed", 0) + 1

        if source_type and hasattr(source_type, "get_config_class"):
            try:
                source_config_class: Type[ConfigModel] = source_type.get_config_class()
                support_status = SupportStatus.UNKNOWN
                capabilities = []
                source_doc = textwrap.dedent(source_type.__doc__ or "")
                if hasattr(source_type, "get_platform_name"):
                    platform_name = source_type.get_platform_name()
                else:
                    # We like platform names to be human readable.
                    platform_name = plugin_name.title()

                if hasattr(source_type, "get_platform_id"):
                    platform_id = source_type.get_platform_id()

                if hasattr(source_type, "get_platform_doc_order"):
                    platform_doc_order = source_type.get_platform_doc_order()
                    create_or_update(
                        source_documentation,
                        [platform_id, "plugins", plugin_name, "doc_order"],
                        platform_doc_order,
                    )

                source_documentation[platform_id] = (
                    source_documentation.get(platform_id) or {}
                )

                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "classname"],
                    ".".join([source_type.__module__, source_type.__name__]),
                )
                plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
                if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
                    plugin_file_name = plugin_file_name + "/__init__.py"
                else:
                    plugin_file_name = plugin_file_name + ".py"
                if os.path.exists(plugin_file_name):
                    create_or_update(
                        source_documentation,
                        [platform_id, "plugins", plugin_name, "filename"],
                        plugin_file_name,
                    )
                else:
                    logger.info(
                        f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}"
                    )

                if hasattr(source_type, "get_support_status"):
                    support_status = source_type.get_support_status()

                if hasattr(source_type, "get_capabilities"):
                    capabilities = list(source_type.get_capabilities())
                    capabilities.sort(key=lambda x: x.capability.value)
                    create_or_update(
                        source_documentation,
                        [platform_id, "plugins", plugin_name, "capabilities"],
                        capabilities,
                    )

                create_or_update(
                    source_documentation, [platform_id, "name"], platform_name
                )

                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "extra_deps"],
                    extra_deps,
                )

                config_dir = f"{out_dir}/config_schemas"
                os.makedirs(config_dir, exist_ok=True)
                with open(f"{config_dir}/{plugin_name}_config.json", "w") as f:
                    f.write(source_config_class.schema_json(indent=2))

                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "config_schema"],
                    source_config_class.schema_json(indent=2) or "",
                )

                table_md = gen_md_table_from_struct(source_config_class.schema())
                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "source_doc"],
                    source_doc or "",
                )
                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "config"],
                    table_md,
                )
                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "support_status"],
                    support_status,
                )

            except Exception:
                # Re-raise: a plugin whose config class fails should break the docs build.
                raise

sources_dir = f"{out_dir}/sources"
|
|
os.makedirs(sources_dir, exist_ok=True)
|
|
|
|
i = 0
|
|
for platform_id, platform_docs in sorted(
|
|
source_documentation.items(),
|
|
key=lambda x: (x[1]["name"].casefold(), x[1]["name"])
|
|
if "name" in x[1]
|
|
else (x[0].casefold(), x[0]),
|
|
):
|
|
if source and platform_id != source:
|
|
continue
|
|
metrics["source_platforms"]["discovered"] = (
|
|
metrics["source_platforms"]["discovered"] + 1
|
|
)
|
|
platform_doc_file = f"{sources_dir}/{platform_id}.md"
|
|
if "name" not in platform_docs:
|
|
# We seem to have discovered written docs that corresponds to a platform, but haven't found linkage to it from the source classes
|
|
warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?"
|
|
logger.error(warning_msg)
|
|
metrics["source_platforms"]["warnings"].append(warning_msg)
|
|
continue
|
|
|
|
        with open(platform_doc_file, "w") as f:
            i += 1
            f.write(f"---\nsidebar_position: {i}\n---\n\n")
            f.write(
                "import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n"
            )
            f.write(f"# {platform_docs['name']}\n")

            if len(platform_docs["plugins"].keys()) > 1:
                # More than one plugin provides integration with this platform.
                f.write(
                    f"There are {len(platform_docs['plugins'].keys())} sources that provide integration with {platform_docs['name']}\n"
                )
                f.write("\n")
                f.write("<table>\n")
                f.write("<tr>")
                for col_header in ["Source Module", "Documentation"]:
                    f.write(f"<td>{col_header}</td>")
                f.write("</tr>")

                for plugin, plugin_docs in sorted(
                    platform_docs["plugins"].items(),
                    key=lambda x: (
                        str(x[1].get("doc_order")) if x[1].get("doc_order") else x[0]
                    ),
                ):
                    f.write("<tr>\n")
                    f.write(f"<td>\n\n`{plugin}`\n\n</td>\n")
                    f.write(
                        f"<td>\n\n\n{platform_docs['plugins'][plugin].get('source_doc') or ''} [Read more...](#module-{plugin})\n\n\n</td>\n"
                    )
                    f.write("</tr>\n")
                f.write("</table>\n\n")

            # Insert platform-level custom docs before the plugin section.
            f.write(platform_docs.get("custom_docs") or "")

            for plugin, plugin_docs in sorted(
                platform_docs["plugins"].items(),
                key=lambda x: (
                    str(x[1].get("doc_order")) if x[1].get("doc_order") else x[0]
                ),
            ):
                f.write(f"\n\n## Module `{plugin}`\n")
                if "support_status" in plugin_docs:
                    f.write(
                        get_support_status_badge(plugin_docs["support_status"]) + "\n\n"
                    )
                if "capabilities" in plugin_docs and len(plugin_docs["capabilities"]):
                    f.write("\n### Important Capabilities\n")
                    f.write("| Capability | Status | Notes |\n")
                    f.write("| ---------- | ------ | ----- |\n")
                    plugin_capabilities: List[CapabilitySetting] = plugin_docs[
                        "capabilities"
                    ]
                    for cap_setting in plugin_capabilities:
                        f.write(
                            f"| {get_capability_text(cap_setting.capability)} | {get_capability_supported_badge(cap_setting.supported)} | {cap_setting.description} |\n"
                        )
                    f.write("\n")

f.write(f"{plugin_docs.get('source_doc') or ''}\n")
|
|
# Insert custom pre section
|
|
f.write(plugin_docs.get("custom_docs_pre", ""))
|
|
f.write("\n### CLI based Ingestion\n")
|
|
if "extra_deps" in plugin_docs:
|
|
f.write("\n#### Install the Plugin\n")
|
|
if plugin_docs["extra_deps"] != []:
|
|
f.write("```shell\n")
|
|
f.write(f"pip install 'acryl-datahub[{plugin}]'\n")
|
|
f.write("```\n")
|
|
else:
|
|
f.write(
|
|
f"The `{plugin}` source works out of the box with `acryl-datahub`.\n"
|
|
)
|
|
if "recipe" in plugin_docs:
|
|
f.write("\n### Starter Recipe\n")
|
|
f.write(
|
|
"Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.\n\n\n"
|
|
)
|
|
f.write(
|
|
"For general pointers on writing and running a recipe, see our [main recipe guide](../../../../metadata-ingestion/README.md#recipes).\n"
|
|
)
|
|
f.write("```yaml\n")
|
|
f.write(plugin_docs["recipe"])
|
|
f.write("\n```\n")
|
|
if "config" in plugin_docs:
|
|
f.write("\n### Config Details\n")
|
|
f.write(
|
|
"""<Tabs>
|
|
<TabItem value="options" label="Options" default>\n\n"""
|
|
)
|
|
f.write(
|
|
"Note that a `.` is used to denote nested fields in the YAML recipe.\n\n"
|
|
)
|
|
f.write(
|
|
"\n<details open>\n<summary>View All Configuration Options</summary>\n\n"
|
|
)
|
|
for doc in plugin_docs["config"]:
|
|
f.write(doc)
|
|
f.write("\n</details>\n\n")
|
|
f.write(
|
|
f"""</TabItem>
|
|
<TabItem value="schema" label="Schema">
|
|
|
|
The [JSONSchema](https://json-schema.org/) for this configuration is inlined below.\n\n
|
|
```javascript
|
|
{plugin_docs['config_schema']}
|
|
```\n\n
|
|
</TabItem>
|
|
</Tabs>\n\n"""
|
|
)
|
|
                # Insert custom plugin docs after the config details.
                f.write(plugin_docs.get("custom_docs_post", ""))
                if "classname" in plugin_docs:
                    f.write("\n### Code Coordinates\n")
                    f.write(f"- Class Name: `{plugin_docs['classname']}`\n")
                    if "filename" in plugin_docs:
                        f.write(
                            f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin_docs['filename']})\n\n"
                        )
                metrics["plugins"]["generated"] += 1

            f.write("\n## Questions\n")
            f.write(
                f"If you've got any questions on configuring ingestion for {platform_docs.get('name', platform_id)}, feel free to ping us on [our Slack](https://slack.datahubproject.io)\n"
            )
        metrics["source_platforms"]["generated"] += 1

    print("Ingestion Documentation Generation Complete")
    print("############################################")
    print(json.dumps(metrics, indent=2))
    print("############################################")
    if metrics["plugins"].get("failed", 0) > 0:
        # Propagate plugin load failures to the caller (e.g. CI) via the exit code.
        sys.exit(1)


if __name__ == "__main__":
    logger.setLevel("INFO")
    generate()