feat(docs): add showing specific fields to docs of specific connectors (#13810)

Aseem Bansal · 2025-06-19 15:08:11 +05:30 · committed by GitHub
parent b12b9aa919 · commit 85b29c9361
3 changed files with 38 additions and 5 deletions


@@ -198,7 +198,7 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
     source_config_class: ConfigModel = source_type.get_config_class()

     plugin.config_json_schema = source_config_class.schema_json(indent=2)
-    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema())
+    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)

     # Write the config json schema to the out_dir.
     config_dir = pathlib.Path(out_dir) / "config_schemas"


@@ -343,7 +343,26 @@ def priority_value(path: str) -> str:
         return "A"

-def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
+def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str, Any]) -> bool:
+    """Check if field should be hidden for the current source"""
+    # Extract field name from the path
+    field_name = schema_field.fieldPath.split('.')[-1]
+
+    # Look in definitions for the field schema
+    definitions = schema_dict.get("definitions", {})
+    for _, def_schema in definitions.items():
+        properties = def_schema.get("properties", {})
+        if field_name in properties:
+            field_schema = properties[field_name]
+            schema_extra = field_schema.get("schema_extra", {})
+            supported_sources = schema_extra.get("supported_sources")
+            if supported_sources and current_source:
+                return current_source.lower() not in [s.lower() for s in supported_sources]
+    return False
+
+
+def gen_md_table_from_json_schema(schema_dict: Dict[str, Any], current_source: Optional[str] = None) -> str:
     # we don't want default field values to be injected into the description of the field
     JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
     schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
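For context on the mechanism the new should_hide_field() relies on: under pydantic v1 semantics (which these config classes use), unknown keyword arguments to Field() are copied verbatim into the generated JSON schema, and nested models are emitted under "definitions". A minimal sketch, not part of this commit; DemoSection and DemoConfig are hypothetical names:

from pydantic import BaseModel, Field  # assumes pydantic v1 semantics


class DemoSection(BaseModel):
    flag: bool = Field(
        default=False,
        description="Demo field.",
        # Unknown Field() kwargs land in the field's JSON schema as-is,
        # so "schema_extra" becomes a plain key that can be read back.
        schema_extra={"supported_sources": ["snowflake"]},
    )


class DemoConfig(BaseModel):
    section: DemoSection


schema = DemoConfig.schema()
# Nested models end up under "definitions", which is exactly where
# should_hide_field() performs its lookup.
print(schema["definitions"]["DemoSection"]["properties"]["flag"]["schema_extra"])
# -> {'supported_sources': ['snowflake']}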
@@ -352,6 +371,8 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     field_tree = FieldTree(field=None)
     for field in schema_fields:
         row: FieldRow = FieldRow.from_schema_field(field)
+        if current_source and should_hide_field(field, current_source, schema_dict):
+            continue
         field_tree.add_field(row)

     field_tree.sort()
@@ -365,12 +386,13 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     return "".join(result)

-def gen_md_table_from_pydantic(model: Type[BaseModel]) -> str:
-    return gen_md_table_from_json_schema(model.schema())
+def gen_md_table_from_pydantic(model: Type[BaseModel], current_source: Optional[str] = None) -> str:
+    return gen_md_table_from_json_schema(model.schema(), current_source)

 if __name__ == "__main__":
     # Simple test code.
     from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

-    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config)))
+    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config, current_source="snowflake")))


@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )

     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )

     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )

     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )

     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
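To trace the lookup path end to end, a self-contained sketch (FakeField and the hand-built schema_dict are illustrative stand-ins for what source_config_class.schema() produces when GEProfilingConfig is nested inside a source config; assumes should_hide_field from the patch above is in scope):

from typing import Any, Dict

# Mirrors the relevant slice of a pydantic v1 .schema() output.
schema_dict: Dict[str, Any] = {
    "definitions": {
        "GEProfilingConfig": {
            "properties": {
                "profile_external_tables": {
                    "type": "boolean",
                    "schema_extra": {"supported_sources": ["redshift", "snowflake"]},
                }
            }
        }
    }
}


class FakeField:
    # should_hide_field() keys off the last segment of the field path.
    fieldPath = "profiling.profile_external_tables"


print(should_hide_field(FakeField(), "snowflake", schema_dict))  # False: listed, so shown
print(should_hide_field(FakeField(), "postgres", schema_dict))   # True: not listed, so hidden

Note that the lookup walks only "definitions", so the filter applies to fields of nested models like GEProfilingConfig; top-level properties of the root model passed to gen_md_table_from_pydantic are never hidden.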