From 85c8e605be045deb59f7548380b550d12e70c900 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Tue, 19 Nov 2024 15:06:16 -0800
Subject: [PATCH] fix(ingest): consider sql parsing fallback as failure (#11896)

---
 metadata-ingestion/src/datahub/cli/check_cli.py    |  4 +++-
 .../src/datahub/sql_parsing/sqlglot_lineage.py     |  9 +++++++++
 .../goldens/test_sqlite_attach_database.json       | 12 ++++++++++++
 .../tests/unit/sql_parsing/test_sqlglot_lineage.py | 11 +++++++++++
 4 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json

diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py
index 39ed1b2bfe..fbe07b64f0 100644
--- a/metadata-ingestion/src/datahub/cli/check_cli.py
+++ b/metadata-ingestion/src/datahub/cli/check_cli.py
@@ -268,7 +268,9 @@ def sql_lineage(
     )
 
     logger.debug("Sql parsing debug info: %s", lineage.debug_info)
-    if lineage.debug_info.error:
+    if lineage.debug_info.table_error:
+        raise lineage.debug_info.table_error
+    elif lineage.debug_info.error:
         logger.debug("Sql parsing error details", exc_info=lineage.debug_info.error)
 
     click.echo(lineage.json(indent=4))
diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py
index b635f8cb47..506bd1d8c6 100644
--- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py
@@ -904,6 +904,15 @@ def _sqlglot_lineage_inner(
     logger.debug("Parsing lineage from sql statement: %s", sql)
     statement = parse_statement(sql, dialect=dialect)
 
+    if isinstance(statement, sqlglot.exp.Command):
+        # For unsupported syntax, sqlglot will usually fallback to parsing as a Command.
+        # This is effectively a parsing error, and we won't get any lineage from it.
+        # See https://github.com/tobymao/sqlglot/commit/3a13fdf4e597a2f0a3f9fc126a129183fe98262f
+        # and https://github.com/tobymao/sqlglot/pull/2874
+        raise UnsupportedStatementTypeError(
+            f"Got unsupported syntax for statement: {sql}"
+        )
+
     original_statement, statement = statement, statement.copy()
     # logger.debug(
     #     "Formatted sql statement: %s",
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json
new file mode 100644
index 0000000000..bcf31f6be8
--- /dev/null
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json
@@ -0,0 +1,12 @@
+{
+    "query_type": "UNKNOWN",
+    "query_type_props": {},
+    "query_fingerprint": null,
+    "in_tables": [],
+    "out_tables": [],
+    "column_lineage": null,
+    "debug_info": {
+        "confidence": 0.0,
+        "generalized_statement": null
+    }
+}
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
index 90cc863d6b..1703412302 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
+++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
@@ -1268,3 +1268,14 @@ WHERE rank_ = 1
         dialect="bigquery",
         expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json",
     )
+
+
+def test_sqlite_attach_database() -> None:
+    assert_sql_result(
+        """\
+ATTACH DATABASE ':memory:' AS aux1
+""",
+        dialect="sqlite",
+        expected_file=RESOURCE_DIR / "test_sqlite_attach_database.json",
+        allow_table_error=True,
+    )