mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-29 17:59:24 +00:00
feat(ingestion/sqlglot): preserve CTEs when extracting SELECT from INSERT statements and add corresponding unit test (#14898)
This commit is contained in:
parent
8248999758
commit
00caa38adf
@ -1176,7 +1176,12 @@ def _try_extract_select(
|
|||||||
statement = sqlglot.exp.Select().select("*").from_(statement)
|
statement = sqlglot.exp.Select().select("*").from_(statement)
|
||||||
elif isinstance(statement, sqlglot.exp.Insert):
|
elif isinstance(statement, sqlglot.exp.Insert):
|
||||||
# TODO Need to map column renames in the expressions part of the statement.
|
# TODO Need to map column renames in the expressions part of the statement.
|
||||||
statement = statement.expression
|
# Preserve CTEs when extracting the SELECT expression from INSERT
|
||||||
|
original_ctes = statement.ctes
|
||||||
|
statement = statement.expression # Get the SELECT expression from the INSERT
|
||||||
|
if isinstance(statement, sqlglot.exp.Query) and original_ctes:
|
||||||
|
for cte in original_ctes:
|
||||||
|
statement = statement.with_(alias=cte.alias, as_=cte.this)
|
||||||
elif isinstance(statement, sqlglot.exp.Update):
|
elif isinstance(statement, sqlglot.exp.Update):
|
||||||
# Assumption: the output table is already captured in the modified tables list.
|
# Assumption: the output table is already captured in the modified tables list.
|
||||||
statement = _extract_select_from_update(statement)
|
statement = _extract_select_from_update(statement)
|
||||||
|
|||||||
@ -0,0 +1,72 @@
|
|||||||
|
{
|
||||||
|
"query_type": "INSERT",
|
||||||
|
"query_type_props": {},
|
||||||
|
"query_fingerprint": "195448498ded7a1b4df767cf0a5ec53e2fa4c7b011234bafe0a60ff9d7d11c1d",
|
||||||
|
"in_tables": [
|
||||||
|
"urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.source_table,PROD)"
|
||||||
|
],
|
||||||
|
"out_tables": [
|
||||||
|
"urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.target_table,PROD)"
|
||||||
|
],
|
||||||
|
"column_lineage": [
|
||||||
|
{
|
||||||
|
"downstream": {
|
||||||
|
"table": "urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.target_table,PROD)",
|
||||||
|
"column": "id",
|
||||||
|
"column_type": null,
|
||||||
|
"native_column_type": null
|
||||||
|
},
|
||||||
|
"upstreams": [
|
||||||
|
{
|
||||||
|
"table": "urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.source_table,PROD)",
|
||||||
|
"column": "id"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"logic": {
|
||||||
|
"is_direct_copy": true,
|
||||||
|
"column_logic": "[source_table].[id] AS [id]"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"downstream": {
|
||||||
|
"table": "urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.target_table,PROD)",
|
||||||
|
"column": "name",
|
||||||
|
"column_type": null,
|
||||||
|
"native_column_type": null
|
||||||
|
},
|
||||||
|
"upstreams": [
|
||||||
|
{
|
||||||
|
"table": "urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.source_table,PROD)",
|
||||||
|
"column": "name"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"logic": {
|
||||||
|
"is_direct_copy": true,
|
||||||
|
"column_logic": "[source_table].[name] AS [name]"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"downstream": {
|
||||||
|
"table": "urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.target_table,PROD)",
|
||||||
|
"column": "value",
|
||||||
|
"column_type": null,
|
||||||
|
"native_column_type": null
|
||||||
|
},
|
||||||
|
"upstreams": [
|
||||||
|
{
|
||||||
|
"table": "urn:li:dataset:(urn:li:dataPlatform:tsql,db.schema.source_table,PROD)",
|
||||||
|
"column": "value"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"logic": {
|
||||||
|
"is_direct_copy": true,
|
||||||
|
"column_logic": "[source_table].[value] AS [value]"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"joins": [],
|
||||||
|
"debug_info": {
|
||||||
|
"confidence": 0.2,
|
||||||
|
"generalized_statement": "WITH temp_cte AS (SELECT id AS id, name AS name, value AS value FROM db.schema.source_table) INSERT INTO db.schema.target_table (id, name, value) SELECT id, name, value FROM temp_cte"
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -199,6 +199,21 @@ insert into downstream (a, c) select a, c from upstream2
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert_with_cte() -> None:
|
||||||
|
assert_sql_result(
|
||||||
|
"""
|
||||||
|
WITH temp_cte AS (
|
||||||
|
SELECT id, name, value
|
||||||
|
FROM db.schema.source_table
|
||||||
|
)
|
||||||
|
INSERT INTO db.schema.target_table (id, name, value)
|
||||||
|
SELECT id, name, value FROM temp_cte
|
||||||
|
""",
|
||||||
|
dialect="tsql",
|
||||||
|
expected_file=RESOURCE_DIR / "test_insert_with_cte.json",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_select_with_full_col_name() -> None:
|
def test_select_with_full_col_name() -> None:
|
||||||
# In this case, `widget` is a struct column.
|
# In this case, `widget` is a struct column.
|
||||||
# This also tests the `default_db` functionality.
|
# This also tests the `default_db` functionality.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user