Mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-02 03:39:03 +00:00)
feat(docs): generate DataHub-optimized entity documentation variant (#15111)
This commit is contained in:
parent f844ea7b2d
commit 9016a4446f
@@ -156,6 +156,9 @@ aspect_registry: Dict[str, AspectDefinition] = {}
 # A holder for generated docs
 generated_documentation: Dict[str, str] = {}
 
+# A holder for generated DataHub-optimized docs (without Docusaurus-specific features)
+generated_documentation_datahub: Dict[str, str] = {}
+
 
 # Patch add_name method to NOT complain about duplicate names
 def add_name(self, name_attr, space_attr, new_schema):
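For orientation, both holders end up keyed by entity name with the rendered markdown as the value. A minimal sketch of the intended contents (the entity name and snippets are illustrative, not taken from a real run):

```python
# Illustrative shape of the two holders after doc generation (values abbreviated):
generated_documentation = {
    "dataset": "# Dataset\n... full Docusaurus page (Tabs, technical sections) ...",
}
generated_documentation_datahub = {
    "dataset": "# Dataset\n... simplified page with inline directives expanded ...",
}
```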
@@ -505,6 +508,123 @@ def make_relnship_docs(relationships: List[Relationship], direction: str) -> str:
     return doc
 
 
+def expand_inline_directives(content: str) -> str:
+    """
+    Expand {{ inline /path/to/file }} directives in markdown content.
+
+    This replicates the behavior of docs-website/generateDocsDir.ts's
+    markdown_process_inline_directives function.
+
+    Args:
+        content: Markdown content potentially containing inline directives
+
+    Returns:
+        Content with inline directives expanded
+    """
+
+    def replace_inline(match: re.Match) -> str:
+        indent = match.group(1)
+        inline_file_path = match.group(3)
+
+        if not inline_file_path.startswith("/"):
+            raise ValueError(f"inline path must be absolute: {inline_file_path}")
+
+        # Read file from repo root (one level up from metadata-ingestion/scripts)
+        repo_root = Path(__file__).parent.parent.parent
+        referenced_file_path = repo_root / inline_file_path.lstrip("/")
+
+        if not referenced_file_path.exists():
+            logger.warning(f"Inline file not found: {inline_file_path}")
+            return match.group(0)
+
+        logger.debug(f"Inlining {inline_file_path}")
+        referenced_content = referenced_file_path.read_text()
+
+        # Add indentation to each line (don't add "Inlined from" comment - files already have path comments)
+        new_content = "\n".join(
+            f"{indent}{line}" for line in referenced_content.split("\n")
+        )
+
+        return new_content
+
+    # Pattern matches: {{ inline /path/to/file }} or {{ inline /path/to/file show_path_as_comment }}
+    pattern = r"^(\s*){{(\s*)inline\s+(\S+)(\s+)(show_path_as_comment\s+)?(\s*)}}$"
+    return re.sub(pattern, replace_inline, content, flags=re.MULTILINE)
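As a rough illustration of what the expansion does (the referenced path below is hypothetical; if it did not exist, the directive would be left in place and a warning logged):

```python
# Hypothetical input: an indented inline directive inside entity docs.
source = "Example query:\n    {{ inline /metadata-ingestion/examples/demo_query.sql }}\n"

# expand_inline_directives() would replace the directive line with the contents of
# <repo_root>/metadata-ingestion/examples/demo_query.sql, re-indenting every line
# of that file with the directive's leading whitespace ("    ").
expanded = expand_inline_directives(source)
```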
+
+
+def convert_relative_links_to_absolute(content: str) -> str:
+    """
+    Convert relative markdown links to absolute docs.datahub.com URLs.
+
+    Relative links don't work properly in DataHub UI, so we convert them to absolute
+    links pointing to the public documentation site.
+
+    Args:
+        content: Markdown content with potential relative links
+
+    Returns:
+        Content with relative .md links converted to absolute URLs
+    """
+
+    def replace_link(match: re.Match) -> str:
+        link_text = match.group(1)
+        link_url = match.group(2)
+
+        # Skip if already absolute
+        if link_url.startswith("http://") or link_url.startswith("https://"):
+            return match.group(0)
+
+        # Parse anchor (e.g., #section)
+        parts = link_url.split("#")
+        path = parts[0]
+        anchor = "#" + parts[1] if len(parts) > 1 else ""
+
+        # Remove .md extension
+        if path.endswith(".md"):
+            path = path[:-3]
+
+        # Resolve path to absolute form
+        if path.startswith("/docs/"):
+            # Already absolute doc path - use as-is
+            resolved = path
+        elif path.startswith("./"):
+            # Same directory (entities) - remove ./
+            resolved = "/docs/generated/metamodel/entities/" + path[2:]
+        elif path.startswith("../"):
+            # Relative path from entities directory
+            # We're at /docs/generated/metamodel/entities/
+            # Count how many levels to go up
+            up_count = 0
+            temp_path = path
+            while temp_path.startswith("../"):
+                up_count += 1
+                temp_path = temp_path[3:]
+
+            # Start from entities path and go up
+            base_parts = ["docs", "generated", "metamodel", "entities"]
+            base_parts = base_parts[: len(base_parts) - up_count]
+
+            # Add remaining path
+            remaining_parts = temp_path.split("/") if temp_path else []
+            resolved_parts = base_parts + remaining_parts
+            resolved = "/" + "/".join(resolved_parts)
+        elif "/" in path:
+            # Path with directory but no prefix - assume it needs /docs/
+            resolved = "/" + path if path.startswith("docs/") else "/docs/" + path
+        else:
+            # Just a filename - assume same directory (entities)
+            resolved = "/docs/generated/metamodel/entities/" + path
+
+        # Build absolute URL (lowercase for docs.datahub.com compatibility)
+        absolute_url = f"https://docs.datahub.com{resolved.lower()}{anchor}"
+
+        return f"[{link_text}]({absolute_url})"
+
+    # Match markdown links with .md extension: [text](path.md) or [text](path.md#anchor)
+    pattern = r"\[([^\]]+)\]\(([^)]+\.md[^)]*)\)"
+    return re.sub(pattern, replace_link, content)
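A few worked examples of the mapping this implements (the link targets are illustrative, not taken from the real docs tree):

```python
# Hypothetical inputs and the URLs they resolve to:
#   [Chart](chart.md)              -> https://docs.datahub.com/docs/generated/metamodel/entities/chart
#   [Dataset](./dataset.md#schema) -> https://docs.datahub.com/docs/generated/metamodel/entities/dataset#schema
#   [Lineage](../../../lineage.md) -> https://docs.datahub.com/docs/lineage
#   [Policies](/docs/authorization/policies.md)
#                                  -> https://docs.datahub.com/docs/authorization/policies
converted = convert_relative_links_to_absolute("See [Chart](chart.md) for charts.")
```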
+
+
 @dataclass
 class FieldInfo:
     name: str
@@ -893,6 +1013,55 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     raise Exception(f"Failed to find information for entity: {entity_name}")
 
 
+def make_entity_docs_datahub(entity_display_name: str) -> str:
+    """
+    Generate DataHub-optimized documentation for an entity.
+
+    This variant is simpler than the Docusaurus version:
+    - Expands {{ inline ... }} directives (since DataHub won't process them)
+    - Converts relative markdown links to absolute docs.datahub.com URLs
+    - Excludes technical sections (already available in DataHub's Columns tab)
+    - Removes Docusaurus-specific components (Tabs, TabItem, etc.)
+
+    Args:
+        entity_display_name: Display name of the entity (e.g., "Dataset")
+
+    Returns:
+        Simplified documentation suitable for DataHub ingestion
+    """
+    entity_name = entity_display_name[0:1].lower() + entity_display_name[1:]
+    entity_def: Optional[EntityDefinition] = entity_registry.get(entity_name)
+
+    if not entity_def:
+        raise Exception(f"Failed to find information for entity: {entity_name}")
+
+    # Get main documentation content (from file or from entity definition)
+    if entity_def.doc_file_contents:
+        doc = entity_def.doc_file_contents
+    elif entity_def.doc:
+        doc = f"# {entity_def.display_name}\n{entity_def.doc}"
+    else:
+        doc = f"# {entity_def.display_name}"
+
+    # Expand {{ inline ... }} directives
+    doc = expand_inline_directives(doc)
+
+    # Convert relative markdown links to absolute URLs
+    doc = convert_relative_links_to_absolute(doc)
+
+    # Add pointer to DataHub's Columns tab for technical details
+    technical_pointer = (
+        "\n\n## Technical Reference\n\n"
+        "For technical details about fields, searchability, and relationships, "
+        "view the **Columns** tab in DataHub.\n"
+    )
+
+    final_doc = doc + technical_pointer
+
+    generated_documentation_datahub[entity_name] = final_doc
+    return final_doc
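A minimal usage sketch, assuming the entity registries have already been loaded (the entity name is illustrative):

```python
# Hypothetical call; "Dataset" must be registered in entity_registry under "dataset".
doc = make_entity_docs_datahub("Dataset")

# The result is cached for the ingestion step and always ends with the
# Technical Reference pointer appended above.
assert generated_documentation_datahub["dataset"] == doc
assert doc.rstrip().endswith("view the **Columns** tab in DataHub.")
```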
+
+
 def create_search_field_name_property() -> List[MetadataChangeProposalWrapper]:
     """
     Create the structured property for documenting search field names.
@@ -1136,9 +1305,14 @@ def generate_stitched_record(
                    ]
                ),
                DatasetPropertiesClass(
-                   description=make_entity_docs(
-                       dataset_urn.split(":")[-1].split(",")[1], relnships_graph
-                   )
+                   description=(
+                       # Generate Docusaurus docs (stores in generated_documentation)
+                       make_entity_docs(entity_display_name, relnships_graph),
+                       # But use DataHub variant for ingestion
+                       generated_documentation_datahub.get(
+                           entity_name, f"# {entity_display_name}"
+                       ),
+                   )[1]  # Select the DataHub variant
                ),
                SubTypesClass(typeNames=["entity"]),
            ],
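The `(..., ...)[1]` construct deserves a note: per the inline comments, `make_entity_docs(...)` is evaluated purely for its side effects (populating `generated_documentation` and the relationship graph), and the tuple's second element, the pre-generated DataHub variant, is what actually becomes the dataset description. A more explicit, roughly equivalent form, reusing the names from the hunk above (sketch only):

```python
# Run the Docusaurus generator for its side effects (relationship graph, generated_documentation).
make_entity_docs(entity_display_name, relnships_graph)
# Use the simplified DataHub variant for the ingested description, falling back to a bare title.
description = generated_documentation_datahub.get(entity_name, f"# {entity_display_name}")
```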
@@ -1355,6 +1529,16 @@ def generate(  # noqa: C901
 
     logger.info("Creating structured property for search field names")
     structured_property_mcps = create_search_field_name_property()
 
+    # Generate DataHub-optimized documentation for all entities
+    # This must be done before generate_stitched_record() so the docs are available for MCPs
+    logger.info("Generating DataHub-optimized entity documentation")
+    for entity_name in entity_registry:
+        entity_def = entity_registry[entity_name]
+        try:
+            make_entity_docs_datahub(entity_def.display_name)
+        except Exception as e:
+            logger.warning(f"Failed to generate DataHub docs for {entity_name}: {e}")
+
     relationship_graph = RelationshipGraph()
     entity_mcps = list(generate_stitched_record(relationship_graph))
@@ -1373,12 +1557,22 @@ def generate(  # noqa: C901
 
         os.makedirs(entity_dir, exist_ok=True)
 
+        # Write Docusaurus variant (with Tabs, technical sections, etc.)
         with open(f"{entity_dir}/{entity_name}.md", "w") as fp:
             fp.write("---\n")
             fp.write(f"sidebar_position: {index}\n")
             fp.write("---\n")
             fp.write(generated_documentation[entity_name])
-            index += 1
 
+        # Write DataHub variant (simplified, with inline directives expanded)
+        if entity_name in generated_documentation_datahub:
+            with open(f"{entity_dir}/{entity_name}-datahub.md", "w") as fp:
+                fp.write("---\n")
+                fp.write(f"sidebar_position: {index}\n")
+                fp.write("---\n")
+                fp.write(generated_documentation_datahub[entity_name])
+
+        index += 1
+
     if file:
         logger.info(f"Will write events to {file}")