feat(docs): show connector-specific config fields only in the docs of supported connectors (#13810)

Author: Aseem Bansal, 2025-06-19 15:08:11 +05:30 (committed by GitHub)
Parent: b12b9aa919
Commit: 85b29c9361
3 changed files with 38 additions and 5 deletions


@@ -198,7 +198,7 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
     source_config_class: ConfigModel = source_type.get_config_class()

     plugin.config_json_schema = source_config_class.schema_json(indent=2)
-    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema())
+    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)

     # Write the config json schema to the out_dir.
     config_dir = pathlib.Path(out_dir) / "config_schemas"
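
With this change, the docs generator threads each plugin's name into the Markdown table builder, so a connector's docs page can omit config fields that the connector does not support. A minimal sketch of the new call shape outside of load_plugin (assuming gen_md_table_from_json_schema is in scope; SnowflakeV2Config is just a convenient real config class):

    from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

    # Render the config table as it would appear on the Snowflake docs page;
    # fields tagged with supported_sources that exclude "snowflake" are dropped.
    markdown = gen_md_table_from_json_schema(
        SnowflakeV2Config.schema(), current_source="snowflake"
    )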


@@ -343,7 +343,26 @@ def priority_value(path: str) -> str:
     return "A"


-def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
+def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str, Any]) -> bool:
+    """Check if a field should be hidden for the current source."""
+    # Extract the field name from the path.
+    field_name = schema_field.fieldPath.split(".")[-1]
+
+    # Look in the schema definitions for the field's schema.
+    definitions = schema_dict.get("definitions", {})
+    for _, def_schema in definitions.items():
+        properties = def_schema.get("properties", {})
+        if field_name in properties:
+            field_schema = properties[field_name]
+            schema_extra = field_schema.get("schema_extra", {})
+            supported_sources = schema_extra.get("supported_sources")
+            if supported_sources and current_source:
+                return current_source.lower() not in [s.lower() for s in supported_sources]
+    return False
+
+
+def gen_md_table_from_json_schema(schema_dict: Dict[str, Any], current_source: Optional[str] = None) -> str:
     # we don't want default field values to be injected into the description of the field
     JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
     schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
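
A quick way to see the helper's behavior is to feed it a hand-built schema dict. Everything below is hypothetical driver code: the definition and field names are made up, and since should_hide_field only reads .fieldPath from the field object, a SimpleNamespace stands in for the translator's field type:

    from types import SimpleNamespace

    # Illustrative schema: one nested definition whose field carries the
    # schema_extra tag listing the sources that support it.
    schema_dict = {
        "definitions": {
            "GEProfilingConfig": {
                "properties": {
                    "use_sampling": {
                        "type": "boolean",
                        "schema_extra": {"supported_sources": ["bigquery", "snowflake"]},
                    }
                }
            }
        }
    }

    field = SimpleNamespace(fieldPath="[type=GEProfilingConfig].use_sampling")

    assert should_hide_field(field, "postgres", schema_dict) is True
    assert should_hide_field(field, "Snowflake", schema_dict) is False  # case-insensitive

Fields with no supported_sources tag are never hidden, so the filtering is opt-in per field.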
@@ -352,6 +371,8 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     field_tree = FieldTree(field=None)
     for field in schema_fields:
         row: FieldRow = FieldRow.from_schema_field(field)
+        if current_source and should_hide_field(field, current_source, schema_dict):
+            continue
         field_tree.add_field(row)

     field_tree.sort()
@@ -365,12 +386,13 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     return "".join(result)


-def gen_md_table_from_pydantic(model: Type[BaseModel]) -> str:
-    return gen_md_table_from_json_schema(model.schema())
+def gen_md_table_from_pydantic(model: Type[BaseModel], current_source: Optional[str] = None) -> str:
+    return gen_md_table_from_json_schema(model.schema(), current_source)


 if __name__ == "__main__":
     # Simple test code.
     from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

-    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config)))
+    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config, current_source="snowflake")))


@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )

     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )

     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )

     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )

     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
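
These tags work because this module is on Pydantic v1, where unrecognized keyword arguments to Field() are copied verbatim into the generated JSON schema; that is the schema_extra key should_hide_field reads back out. A small self-contained sketch with an illustrative model (not the real GEProfilingConfig):

    from pydantic import BaseModel, Field  # Pydantic v1

    class ProfilingSketch(BaseModel):
        sample_size: int = Field(
            default=10000,
            description="Rows to sample for column-level profiling.",
            schema_extra={"supported_sources": ["bigquery", "snowflake"]},
        )

    # The extra kwarg appears as-is in the field's JSON schema, e.g. the dict
    # includes: "schema_extra": {"supported_sources": ["bigquery", "snowflake"]}
    print(ProfilingSketch.schema()["properties"]["sample_size"])

In the real pipeline the profiling config is nested inside each source's config class, so the same data lands under the schema's definitions section, which is exactly where should_hide_field looks.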