diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py
index e462f46221..7cc49281e4 100644
--- a/metadata-ingestion/scripts/docgen.py
+++ b/metadata-ingestion/scripts/docgen.py
@@ -198,7 +198,7 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
         source_config_class: ConfigModel = source_type.get_config_class()
 
         plugin.config_json_schema = source_config_class.schema_json(indent=2)
-        plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema())
+        plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)
 
         # Write the config json schema to the out_dir.
         config_dir = pathlib.Path(out_dir) / "config_schemas"
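The only change on the docgen side is to thread each plugin's registered name into the table generator as `current_source`. A minimal sketch of the equivalent standalone call is below; the module import path and the use of `SnowflakeV2Config` are assumptions for illustration, and it relies on the updated `gen_md_table_from_json_schema()` signature introduced in docs_config_table.py further down.

```python
# Illustrative only: build the per-source config table for one plugin by hand,
# the way the updated load_plugin() call does. The import path for the scripts
# module is an assumption, not taken from this diff.
from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

from docs_config_table import gen_md_table_from_json_schema

plugin_name = "snowflake"  # the ingestion source's registered name
config_md = gen_md_table_from_json_schema(
    SnowflakeV2Config.schema(), current_source=plugin_name
)
print(config_md[:500])  # rendered markdown table, minus fields hidden for this source
```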
diff --git a/metadata-ingestion/scripts/docs_config_table.py b/metadata-ingestion/scripts/docs_config_table.py
index 3c5d9d0b0a..58cf066b73 100644
--- a/metadata-ingestion/scripts/docs_config_table.py
+++ b/metadata-ingestion/scripts/docs_config_table.py
@@ -343,7 +343,26 @@ def priority_value(path: str) -> str:
     return "A"
 
 
-def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
+def should_hide_field(schema_field, current_source: str, schema_dict: Dict[str, Any]) -> bool:
+    """Check if field should be hidden for the current source"""
+    # Extract field name from the path
+    field_name = schema_field.fieldPath.split('.')[-1]
+
+    # Look in definitions for the field schema
+    definitions = schema_dict.get("definitions", {})
+    for _, def_schema in definitions.items():
+        properties = def_schema.get("properties", {})
+        if field_name in properties:
+            field_schema = properties[field_name]
+            schema_extra = field_schema.get("schema_extra", {})
+            supported_sources = schema_extra.get("supported_sources")
+
+            if supported_sources and current_source:
+                return current_source.lower() not in [s.lower() for s in supported_sources]
+
+    return False
+
+def gen_md_table_from_json_schema(schema_dict: Dict[str, Any], current_source: Optional[str] = None) -> str:
     # we don't want default field values to be injected into the description of the field
     JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
     schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
@@ -352,6 +371,8 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     field_tree = FieldTree(field=None)
     for field in schema_fields:
         row: FieldRow = FieldRow.from_schema_field(field)
+        if current_source and should_hide_field(field, current_source, schema_dict):
+            continue
         field_tree.add_field(row)
 
     field_tree.sort()
@@ -365,12 +386,13 @@ def gen_md_table_from_json_schema(schema_dict: Dict[str, Any]) -> str:
     return "".join(result)
 
 
-def gen_md_table_from_pydantic(model: Type[BaseModel]) -> str:
-    return gen_md_table_from_json_schema(model.schema())
+def gen_md_table_from_pydantic(model: Type[BaseModel], current_source: Optional[str] = None) -> str:
+    return gen_md_table_from_json_schema(model.schema(), current_source)
+
 
 
 if __name__ == "__main__":
     # Simple test code.
     from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
 
-    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config)))
+    print("".join(gen_md_table_from_pydantic(SnowflakeV2Config, current_source="snowflake")))
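The filtering rule itself is a lookup of the field's bare name in the schema's `definitions`, followed by a case-insensitive membership test against the `supported_sources` annotation. A rough, self-contained illustration is below; the schema dict and the `FakeSchemaField` stand-in (only `.fieldPath` matters) are made up for the example, and it assumes the `should_hide_field()` defined above is in scope.

```python
# Hand-built inputs for should_hide_field(); everything here is illustrative.
from dataclasses import dataclass


@dataclass
class FakeSchemaField:
    # Stand-in for the fields yielded by JsonSchemaTranslator.get_fields_from_schema();
    # only the trailing segment of fieldPath is consulted.
    fieldPath: str


schema_dict = {
    "definitions": {
        "GEProfilingConfig": {
            "properties": {
                "use_sampling": {
                    "type": "boolean",
                    "schema_extra": {"supported_sources": ["bigquery", "snowflake"]},
                }
            }
        }
    }
}

field = FakeSchemaField(fieldPath="profiling.use_sampling")

# Hidden for a source that is not listed in supported_sources.
assert should_hide_field(field, "mysql", schema_dict)
# Kept for a listed source; the comparison is case-insensitive.
assert not should_hide_field(field, "Snowflake", schema_dict)
# Kept everywhere when the field carries no supported_sources annotation.
assert not should_hide_field(field, "mysql", {"definitions": {}})
```

Worth noting: because the lookup scans every entry in `definitions` and matches on the bare field name, two identically named fields in different config sub-models would share the first matching visibility rule.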
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
index 9c251c040b..7b9bd9dd09 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )
 
     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )
 
     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )
 
     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )
 
     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
 
     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )
 
     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )
 
     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
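These annotations work because pydantic v1 (which this code targets) copies unrecognized `Field()` keyword arguments verbatim into the generated JSON schema for that property, which is exactly where `should_hide_field()` reads them back out. A minimal sketch of that mechanism follows, using invented model and field names rather than the real DataHub configs, and assuming pydantic v1 schema semantics.

```python
# Illustrative only: how an extra `schema_extra` kwarg on a pydantic v1 Field
# surfaces in .schema(), nested under "definitions" for sub-models. Model and
# field names are invented for this sketch.
from pydantic import BaseModel, Field


class FakeProfilingConfig(BaseModel):
    use_sampling: bool = Field(
        default=True,
        description="Whether to profile column level stats on a sample of the table.",
        # Unknown Field() kwargs are added to this property's JSON schema as-is.
        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
    )


class FakeSourceConfig(BaseModel):
    profiling: FakeProfilingConfig = Field(default_factory=FakeProfilingConfig)


schema = FakeSourceConfig.schema()
prop = schema["definitions"]["FakeProfilingConfig"]["properties"]["use_sampling"]
print(prop["schema_extra"])
# Expected (pydantic v1): {'supported_sources': ['bigquery', 'snowflake']}
```

Because nested config models land under `definitions` regardless of which source config embeds them, the same annotation on `GEProfilingConfig` fields is visible when generating docs for any source, and `should_hide_field()` finds it by scanning every definition.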