feat(docs): show connector-specific config fields only in the docs of supported connectors (#13810)

Author: Aseem Bansal, 2025-06-19 15:08:11 +05:30 (committed by GitHub)
Parent: b12b9aa919
Commit: 85b29c9361
3 changed files with 38 additions and 5 deletions


@@ -198,7 +198,7 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
     source_config_class: ConfigModel = source_type.get_config_class()

     plugin.config_json_schema = source_config_class.schema_json(indent=2)
-    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema())
+    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)

     # Write the config json schema to the out_dir.
     config_dir = pathlib.Path(out_dir) / "config_schemas"
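
With this change, the docs generator threads each plugin's name into the Markdown table builder, so a connector's docs page can omit config fields that the connector does not support. A minimal sketch of the new call shape outside of load_plugin (assuming gen_md_table_from_json_schema is in scope; SnowflakeV2Config is just a convenient real config class):

    from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

    # Render the config table as it would appear on the Snowflake docs page;
    # fields tagged with supported_sources that exclude "snowflake" are dropped.
    markdown = gen_md_table_from_json_schema(
        SnowflakeV2Config.schema(), current_source="snowflake"
    )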


@@ -343,7 +343,26 @@ def priority_value(path: str) -> str:
     return "A"


-def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
+def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str, Any]) -> bool:
+    """Check if a field should be hidden for the current source."""
+    # Extract the field name from the path.
+    field_name = schema_field.fieldPath.split(".")[-1]
+
+    # Look in the schema definitions for the field's schema.
+    definitions = schema_dict.get("definitions", {})
+    for _, def_schema in definitions.items():
+        properties = def_schema.get("properties", {})
+        if field_name in properties:
+            field_schema = properties[field_name]
+            schema_extra = field_schema.get("schema_extra", {})
+            supported_sources = schema_extra.get("supported_sources")
+            if supported_sources and current_source:
+                return current_source.lower() not in [s.lower() for s in supported_sources]
+    return False
+
+
+def gen_md_table_from_json_schema(schema_dict: Dict[str, Any], current_source: Optional[str] = None) -> str:
     # we don't want default field values to be injected into the description of the field
     JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
     schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
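
A quick way to see the helper's behavior is to feed it a hand-built schema dict. Everything below is hypothetical driver code: the definition and field names are made up, and since should_hide_field only reads .fieldPath from the field object, a SimpleNamespace stands in for the translator's field type:

    from types import SimpleNamespace

    # Illustrative schema: one nested definition whose field carries the
    # schema_extra tag listing the sources that support it.
    schema_dict = {
        "definitions": {
            "GEProfilingConfig": {
                "properties": {
                    "use_sampling": {
                        "type": "boolean",
                        "schema_extra": {"supported_sources": ["bigquery", "snowflake"]},
                    }
                }
            }
        }
    }

    field = SimpleNamespace(fieldPath="[type=GEProfilingConfig].use_sampling")

    assert should_hide_field(field, "postgres", schema_dict) is True
    assert should_hide_field(field, "Snowflake", schema_dict) is False  # case-insensitive

Fields with no supported_sources tag are never hidden, so the filtering is opt-in per field.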
@@ -352,6 +371,8 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     field_tree = FieldTree(field=None)
     for field in schema_fields:
         row: FieldRow = FieldRow.from_schema_field(field)
+        if current_source and should_hide_field(field, current_source, schema_dict):
+            continue
         field_tree.add_field(row)

     field_tree.sort()
@@ -365,12 +386,13 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     return "".join(result)


-def gen_md_table_from_pydantic(model: Type[BaseModel]) -> str:
-    return gen_md_table_from_json_schema(model.schema())
+def gen_md_table_from_pydantic(model: Type[BaseModel], current_source: Optional[str] = None) -> str:
+    return gen_md_table_from_json_schema(model.schema(), current_source)


 if __name__ == "__main__":
     # Simple test code.
     from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

-    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config)))
+    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config, current_source="snowflake")))


@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )

     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )

     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )

     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )

     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
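
These tags work because this module is on Pydantic v1, where unrecognized keyword arguments to Field() are copied verbatim into the generated JSON schema; that is the schema_extra key should_hide_field reads back out. A small self-contained sketch with an illustrative model (not the real GEProfilingConfig):

    from pydantic import BaseModel, Field  # Pydantic v1

    class ProfilingSketch(BaseModel):
        sample_size: int = Field(
            default=10000,
            description="Rows to sample for column-level profiling.",
            schema_extra={"supported_sources": ["bigquery", "snowflake"]},
        )

    # The extra kwarg appears as-is in the field's JSON schema, e.g. the dict
    # includes: "schema_extra": {"supported_sources": ["bigquery", "snowflake"]}
    print(ProfilingSketch.schema()["properties"]["sample_size"])

In the real pipeline the profiling config is nested inside each source's config class, so the same data lands under the schema's definitions section, which is exactly where should_hide_field looks.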