diff --git a/CLAUDE.md b/CLAUDE.md index 0a2b813e2d..40e76b006a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -89,6 +89,11 @@ DataHub is a **schema-first, event-driven metadata platform** with three core la - Frontend: Tests in `__tests__/` or `.test.tsx` files - Smoke tests go in the `smoke-test/` directory +### Commits + +- Follow Conventional Commits format for commit messages +- Breaking Changes: Always update `docs/how/updating-datahub.md` for breaking changes. Write entries for non-technical audiences, reference the PR number, and focus on what users need to change rather than internal implementation details + ## Key Documentation **Essential reading:** @@ -107,4 +112,3 @@ DataHub is a **schema-first, event-driven metadata platform** with three core la - Entity Registry is defined in YAML, not code (`entity-registry.yml`) - All metadata changes flow through the event streaming system - GraphQL schema is generated from backend GMS APIs -- Follow Conventional Commits format for commit messages diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index ecc1542fad..12c6674d96 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -37,6 +37,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - `acryl-datahub-gx-plugin` - `acryl-datahub-dagster-plugin` (already required Python 3.9+) - #13619: The `acryl-datahub-airflow-plugin` has dropped support for Airflow versions less than 2.7. +- #14015: In the sql-queries source, the `default_dialect` configuration parameter has been renamed to `override_dialect`. 
This also affects the Python SDK methods: + - `DataHubGraph.parse_sql_lineage(default_dialect=...)` → `DataHubGraph.parse_sql_lineage(override_dialect=...)` + - `LineageClient.add_lineage_via_sql(default_dialect=...)` → `LineageClient.add_lineage_via_sql(override_dialect=...)` ### Known Issues diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 0870af8603..1179c3c5d1 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -1576,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI): env: str = DEFAULT_ENV, default_db: Optional[str] = None, default_schema: Optional[str] = None, - default_dialect: Optional[str] = None, + override_dialect: Optional[str] = None, ) -> "SqlParsingResult": from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage @@ -1590,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI): schema_resolver=schema_resolver, default_db=default_db, default_schema=default_schema, - default_dialect=default_dialect, + override_dialect=override_dialect, ) def create_tag(self, tag_name: str) -> str: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py index f3e8e774e4..ea5dc07252 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): description="The default schema to use for unqualified table names", default=None, ) - default_dialect: Optional[str] = Field( + override_dialect: Optional[str] = Field( description="The SQL dialect to use when parsing queries. 
Overrides automatic dialect detection.", default=None, ) @@ -181,7 +181,7 @@ class SqlQueriesSource(Source): schema_resolver=self.schema_resolver, default_db=self.config.default_db, default_schema=self.config.default_schema, - default_dialect=self.config.default_dialect, + override_dialect=self.config.override_dialect, ) if result.debug_info.table_error: logger.info(f"Error parsing table lineage, {result.debug_info.table_error}") diff --git a/metadata-ingestion/src/datahub/sdk/lineage_client.py b/metadata-ingestion/src/datahub/sdk/lineage_client.py index f1034a373e..9c585be58b 100644 --- a/metadata-ingestion/src/datahub/sdk/lineage_client.py +++ b/metadata-ingestion/src/datahub/sdk/lineage_client.py @@ -478,7 +478,7 @@ class LineageClient: env: str = "PROD", default_db: Optional[str] = None, default_schema: Optional[str] = None, - default_dialect: Optional[str] = None, + override_dialect: Optional[str] = None, ) -> None: """Add lineage by parsing a SQL query.""" from datahub.sql_parsing.sqlglot_lineage import ( @@ -494,7 +494,7 @@ class LineageClient: platform_instance=platform_instance, env=env, graph=self._client._graph, - default_dialect=default_dialect, + override_dialect=override_dialect, ) if parsed_result.debug_info.table_error: diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index ad98d36a02..2e4d13d9c3 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import ( QueryTypeProps, ) from datahub.sql_parsing.sqlglot_utils import ( + DialectOrStr, get_dialect, get_query_fingerprint_debug, is_dialect_instance, @@ -1231,12 +1232,12 @@ def _sqlglot_lineage_inner( schema_resolver: SchemaResolverInterface, default_db: Optional[str] = None, default_schema: Optional[str] = None, - default_dialect: Optional[str] = None, + 
override_dialect: Optional[DialectOrStr] = None, ) -> SqlParsingResult: - if not default_dialect: - dialect = get_dialect(schema_resolver.platform) + if override_dialect: + dialect = get_dialect(override_dialect) else: - dialect = get_dialect(default_dialect) + dialect = get_dialect(schema_resolver.platform) default_db = _normalize_db_or_schema(default_db, dialect) default_schema = _normalize_db_or_schema(default_schema, dialect) @@ -1423,7 +1424,7 @@ def _sqlglot_lineage_nocache( schema_resolver: SchemaResolverInterface, default_db: Optional[str] = None, default_schema: Optional[str] = None, - default_dialect: Optional[str] = None, + override_dialect: Optional[DialectOrStr] = None, ) -> SqlParsingResult: """Parse a SQL statement and generate lineage information. @@ -1441,8 +1442,8 @@ def _sqlglot_lineage_nocache( can be brittle with respect to missing schema information and complex SQL logic like UNNESTs. - The SQL dialect can be given as an argument called default_dialect or it can - be inferred from the schema_resolver's platform. + The SQL dialect will be inferred from the schema_resolver's platform. + That inference can be overridden by passing an override_dialect argument. The set of supported dialects is the same as sqlglot's. See their `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_ for the full list. @@ -1457,7 +1458,7 @@ def _sqlglot_lineage_nocache( schema_resolver: The schema resolver to use for resolving table schemas. default_db: The default database to use for unqualified table names. default_schema: The default schema to use for unqualified table names. - default_dialect: A default dialect to override the dialect provided by 'schema_resolver'. + override_dialect: Override the dialect provided by 'schema_resolver'. Returns: A SqlParsingResult object containing the parsed lineage information. 
@@ -1482,7 +1483,7 @@ def _sqlglot_lineage_nocache( schema_resolver=schema_resolver, default_db=default_db, default_schema=default_schema, - default_dialect=default_dialect, + override_dialect=override_dialect, ) except Exception as e: return SqlParsingResult.make_from_error(e) @@ -1520,15 +1521,15 @@ def sqlglot_lineage( schema_resolver: SchemaResolverInterface, default_db: Optional[str] = None, default_schema: Optional[str] = None, - default_dialect: Optional[str] = None, + override_dialect: Optional[DialectOrStr] = None, ) -> SqlParsingResult: if schema_resolver.includes_temp_tables(): return _sqlglot_lineage_nocache( - sql, schema_resolver, default_db, default_schema, default_dialect + sql, schema_resolver, default_db, default_schema, override_dialect ) else: return _sqlglot_lineage_cached( - sql, schema_resolver, default_db, default_schema, default_dialect + sql, schema_resolver, default_db, default_schema, override_dialect ) @@ -1580,7 +1581,7 @@ def create_lineage_sql_parsed_result( default_schema: Optional[str] = None, graph: Optional[DataHubGraph] = None, schema_aware: bool = True, - default_dialect: Optional[str] = None, + override_dialect: Optional[DialectOrStr] = None, ) -> SqlParsingResult: schema_resolver = create_schema_resolver( platform=platform, @@ -1600,7 +1601,7 @@ def create_lineage_sql_parsed_result( schema_resolver=schema_resolver, default_db=default_db, default_schema=default_schema, - default_dialect=default_dialect, + override_dialect=override_dialect, ) except Exception as e: return SqlParsingResult.make_from_error(e)