From 900d7fe24461a142a8fdcf5bf1840ce9fc65b628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Sun, 28 Sep 2025 11:36:37 +0200 Subject: [PATCH] docs: hide pydantic_removed_field marked fields from documentation (#14829) Co-authored-by: Claude --- metadata-ingestion/scripts/docgen.py | 6 +-- .../scripts/docs_config_table.py | 44 ++++++++++++++++--- .../configuration/validate_field_removal.py | 3 ++ 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py index a14d7ae172..bf6b6be175 100644 --- a/metadata-ingestion/scripts/docgen.py +++ b/metadata-ingestion/scripts/docgen.py @@ -12,7 +12,7 @@ from typing import Dict, List, Optional import click from docgen_types import Platform, Plugin -from docs_config_table import gen_md_table_from_json_schema +from docs_config_table import gen_md_table_from_pydantic from datahub.configuration.common import ConfigModel from datahub.ingestion.api.decorators import ( @@ -244,8 +244,8 @@ def create_plugin_from_capability_data( source_config_class: ConfigModel = source_type.get_config_class() plugin.config_json_schema = source_config_class.schema_json(indent=2) - plugin.config_md = gen_md_table_from_json_schema( - source_config_class.schema(), current_source=plugin_name + plugin.config_md = gen_md_table_from_pydantic( + source_config_class, current_source=plugin_name ) # Write the config json schema to the out_dir. diff --git a/metadata-ingestion/scripts/docs_config_table.py b/metadata-ingestion/scripts/docs_config_table.py index 9f21779525..d636f5a5ca 100644 --- a/metadata-ingestion/scripts/docs_config_table.py +++ b/metadata-ingestion/scripts/docs_config_table.py @@ -1,7 +1,7 @@ import html import json import re -from typing import Any, ClassVar, Dict, Iterable, List, Optional, Type +from typing import Any, ClassVar, Dict, Iterable, List, Optional, Set, Type from pydantic import BaseModel, Field @@ -345,12 +345,40 @@ def priority_value(path: str) -> str: return "A" +def _get_removed_fields_from_model(model_class: Type[BaseModel]) -> set: + """Extract fields marked as removed via pydantic_removed_field from a Pydantic model""" + removed_fields = set() + + # Check pre-root validators for removal markers + if hasattr(model_class, "__pre_root_validators__"): + for validator in model_class.__pre_root_validators__: + removed_field = getattr(validator, "_doc_removed_field", None) + if removed_field is not None: + removed_fields.add(removed_field) + + return removed_fields + + +def _is_removed_field(field_name: str, removed_fields: Optional[Set[str]]) -> bool: + """Check if a field is marked as removed""" + return field_name in removed_fields if removed_fields else False + + def should_hide_field( - schema_field: SchemaFieldClass, current_source: str, schema_dict: Dict[str, Any] + schema_field: SchemaFieldClass, + current_source: str, + schema_dict: Dict[str, Any], + removed_fields: Optional[Set[str]] = None, ) -> bool: """Check if field should be hidden for the current source""" + # Extract field name from the path field_name = schema_field.fieldPath.split(".")[-1] + + # Hide removed fields + if _is_removed_field(field_name, removed_fields): + return True + for ends_with in [ "pattern.[type=array].allow", "pattern.[type=array].allow.[type=string].string", @@ -380,9 +408,12 @@ def should_hide_field( def gen_md_table_from_json_schema( - schema_dict: Dict[str, Any], current_source: Optional[str] = None + schema_dict: Dict[str, Any], + current_source: Optional[str] = None, + removed_fields: Optional[Set[str]] = None, ) -> str: # we don't want default field values to be injected into the description of the field + JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict)) result: List[str] = [FieldHeader().to_md_line()] @@ -390,7 +421,9 @@ def gen_md_table_from_json_schema( field_tree = FieldTree(field=None) for field in schema_fields: row: FieldRow = FieldRow.from_schema_field(field) - if current_source and should_hide_field(field, current_source, schema_dict): + if current_source and should_hide_field( + field, current_source, schema_dict, removed_fields + ): continue field_tree.add_field(row) @@ -408,7 +441,8 @@ def gen_md_table_from_json_schema( def gen_md_table_from_pydantic( model: Type[BaseModel], current_source: Optional[str] = None ) -> str: - return gen_md_table_from_json_schema(model.schema(), current_source) + removed_fields = _get_removed_fields_from_model(model) + return gen_md_table_from_json_schema(model.schema(), current_source, removed_fields) if __name__ == "__main__": diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_removal.py b/metadata-ingestion/src/datahub/configuration/validate_field_removal.py index 80b11c8e03..0433730dc7 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_field_removal.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_removal.py @@ -24,6 +24,9 @@ def pydantic_removed_field( values.pop(field) return values + # Mark the function as handling a removed field for doc generation + _validate_field_removal._doc_removed_field = field # type: ignore[attr-defined] + # Hack: Pydantic maintains unique list of validators by referring its __name__. # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264 # This hack ensures that multiple field removals do not overwrite each other.