mirror of https://github.com/datahub-project/datahub.git (synced 2025-06-27 05:03:31 +00:00)
feat(docs): add showing specific fields to docs of specific connectors (#13810)
commit 85b29c9361 (parent b12b9aa919)
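In short: a connector config field can now declare which sources it applies to by attaching schema_extra={"supported_sources": [...]} to its pydantic Field, and the docs generator threads the current plugin name through gen_md_table_from_json_schema(..., current_source=...) so that fields a connector does not support are omitted from its generated config table. Below is a minimal, self-contained sketch of that mechanism, assuming pydantic v1 Field semantics; ExampleConfig and is_field_documented are illustrative names only, not part of this change.

# Illustrative sketch only (assumes pydantic v1): how a "supported_sources" hint
# attached to a Field ends up in the JSON schema and can drive docs filtering.
from typing import Optional

from pydantic import BaseModel, Field


class ExampleConfig(BaseModel):  # hypothetical model, not part of this commit
    profile_table_size_limit: Optional[int] = Field(
        default=5,
        description="Profile tables only if their size is less than specified GBs.",
        # Extra Field kwargs are copied verbatim into the field's JSON schema.
        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
    )


def is_field_documented(field_schema: dict, current_source: str) -> bool:
    """Illustrative helper: True if the field should appear in current_source's docs."""
    supported = field_schema.get("schema_extra", {}).get("supported_sources")
    # Fields without a supported_sources hint are documented for every source.
    return supported is None or current_source.lower() in [s.lower() for s in supported]


if __name__ == "__main__":
    props = ExampleConfig.schema()["properties"]
    for name, field_schema in props.items():
        print(name, "-> snowflake:", is_field_documented(field_schema, "snowflake"))
        print(name, "-> mysql:", is_field_documented(field_schema, "mysql"))

In the actual change, the equivalent check lives in should_hide_field below and runs per schema field while the markdown table is built.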
@@ -198,7 +198,7 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
     source_config_class: ConfigModel = source_type.get_config_class()

     plugin.config_json_schema = source_config_class.schema_json(indent=2)
-    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema())
+    plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)

     # Write the config json schema to the out_dir.
     config_dir = pathlib.Path(out_dir) / "config_schemas"
@@ -343,7 +343,26 @@ def priority_value(path: str) -> str:
     return "A"


-def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
+def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str, Any]) -> bool:
+    """Check if field should be hidden for the current source"""
+    # Extract field name from the path
+    field_name = schema_field.fieldPath.split('.')[-1]
+
+    # Look in definitions for the field schema
+    definitions = schema_dict.get("definitions", {})
+    for _, def_schema in definitions.items():
+        properties = def_schema.get("properties", {})
+        if field_name in properties:
+            field_schema = properties[field_name]
+            schema_extra = field_schema.get("schema_extra", {})
+            supported_sources = schema_extra.get("supported_sources")
+
+            if supported_sources and current_source:
+                return current_source.lower() not in [s.lower() for s in supported_sources]
+
+    return False
+
+def gen_md_table_from_json_schema(schema_dict: Dict[str, Any], current_source: Optional[str] = None) -> str:
     # we don't want default field values to be injected into the description of the field
     JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
     schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
@@ -352,6 +371,8 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     field_tree = FieldTree(field=None)
     for field in schema_fields:
         row: FieldRow = FieldRow.from_schema_field(field)
+        if current_source and should_hide_field(field, current_source, schema_dict):
+            continue
         field_tree.add_field(row)

     field_tree.sort()
@@ -365,12 +386,13 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     return "".join(result)


-def gen_md_table_from_pydantic(model: Type[BaseModel]) -> str:
-    return gen_md_table_from_json_schema(model.schema())
+def gen_md_table_from_pydantic(model: Type[BaseModel], current_source: Optional[str] = None) -> str:
+    return gen_md_table_from_json_schema(model.schema(), current_source)
+


 if __name__ == "__main__":
     # Simple test code.
     from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

-    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config)))
+    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config, current_source="snowflake")))
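The hunks that follow annotate GEProfilingConfig fields with the new supported_sources hint. One note on why should_hide_field above walks schema_dict["definitions"] rather than only the top-level "properties": profiling options like these live in nested config blocks inside a source's config, and pydantic v1 emits nested models as $ref entries whose fields sit under "definitions". A small illustrative sketch, with hypothetical Profiling and SourceConfig models standing in for the real classes:

# Illustrative sketch only (assumes pydantic v1): fields of a nested config block
# appear under "definitions" in the parent config's JSON schema.
from typing import Optional

from pydantic import BaseModel, Field


class Profiling(BaseModel):  # hypothetical stand-in for a nested profiling block
    sample_size: int = Field(
        default=10000,
        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
    )


class SourceConfig(BaseModel):  # hypothetical source config embedding the block
    profiling: Optional[Profiling] = None


if __name__ == "__main__":
    schema = SourceConfig.schema()
    # sample_size is not in schema["properties"]; it lives under the nested model's
    # definition, which is why should_hide_field searches every entry in "definitions".
    nested = schema["definitions"]["Profiling"]["properties"]["sample_size"]
    print(nested["schema_extra"])  # {'supported_sources': ['bigquery', 'snowflake']}

Under this scheme a field such as profile_table_row_count_estimate_only, marked below as supported only for postgres and mysql, would be dropped from, say, the generated Snowflake connector docs.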
@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )

     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )

     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )
     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(