2023-12-28 15:24:26 +05:30
|
|
|
from typing import List
|
|
|
|
|
|
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigQueryTableRef
|
2024-02-09 16:27:45 -05:00
|
|
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
|
|
from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
|
2023-12-28 15:24:26 +05:30
|
|
|
|
|
|
|
|
|
|
|
class BigQuerySQLParser:
|
|
|
|
def __init__(self, sql_query: str, schema_resolver: SchemaResolver) -> None:
|
|
|
|
self.result = sqlglot_lineage(sql_query, schema_resolver)
|
|
|
|
|
|
|
|
def get_tables(self) -> List[str]:
|
|
|
|
ans = []
|
|
|
|
for urn in self.result.in_tables:
|
|
|
|
table_ref = BigQueryTableRef.from_urn(urn)
|
|
|
|
ans.append(str(table_ref.table_identifier))
|
|
|
|
return ans
|
|
|
|
|
|
|
|
def get_columns(self) -> List[str]:
|
|
|
|
ans = []
|
|
|
|
for col_info in self.result.column_lineage or []:
|
|
|
|
for col_ref in col_info.upstreams:
|
|
|
|
ans.append(col_ref.column)
|
|
|
|
return ans
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_basic():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""SELECT * FROM project_1.database_1.view_1""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project_1.database_1.view_1"]
|
2022-05-03 11:13:23 +02:00
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
/*
|
|
|
|
HERE IS A STANDARD COMMENT BLOCK
|
|
|
|
THIS WILL NOT BREAK sqllineage
|
|
|
|
*/
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
|
|
|
|
#This, comment will not break sqllineage
|
|
|
|
SELECT foo
|
|
|
|
-- this comment will not break sqllineage either
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
FROM `project.dataset.src_tbl`
|
2023-12-28 15:24:26 +05:30
|
|
|
""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-05-03 11:13:23 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.dataset.src_tbl"]
|
|
|
|
|
|
|
|
|
2022-12-14 10:58:03 -03:00
|
|
|
def test_bigquery_sql_lineage_camel_case_table():
|
|
|
|
"""
|
|
|
|
This test aims to test the parameter to ignore sqllineage lowercasing.
|
|
|
|
On the BigQuery service, it's possible to use uppercase name un datasets and tables.
|
|
|
|
The lowercasing, by default, breaks the lineage construction in these cases.
|
|
|
|
"""
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
/*
|
|
|
|
HERE IS A STANDARD COMMENT BLOCK
|
|
|
|
THIS WILL NOT BREAK sqllineage
|
|
|
|
*/
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
|
|
|
|
#This, comment will not break sqllineage
|
|
|
|
SELECT foo, bar
|
|
|
|
-- this comment will not break sqllineage either
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
FROM `project.dataset.CamelCaseTable`
|
|
|
|
""",
|
2023-12-28 15:24:26 +05:30
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-12-14 10:58:03 -03:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.dataset.CamelCaseTable"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_camel_case_dataset():
|
|
|
|
"""
|
|
|
|
This test aims to test the parameter to ignore sqllineage lowercasing.
|
|
|
|
On the BigQuery service, it's possible to use uppercase name un datasets and tables.
|
|
|
|
The lowercasing, by default, breaks the lineage construction in these cases.
|
|
|
|
"""
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
/*
|
|
|
|
HERE IS A STANDARD COMMENT BLOCK
|
|
|
|
THIS WILL NOT BREAK sqllineage
|
|
|
|
*/
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
|
|
|
|
#This, comment will not break sqllineage
|
|
|
|
SELECT foo, bar
|
|
|
|
-- this comment will not break sqllineage either
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
FROM `project.DataSet.table`
|
|
|
|
""",
|
2023-12-28 15:24:26 +05:30
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-12-14 10:58:03 -03:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.DataSet.table"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_camel_case_table_and_dataset():
|
|
|
|
"""
|
|
|
|
This test aims to test the parameter to ignore sqllineage lowercasing.
|
|
|
|
On the BigQuery service, it's possible to use uppercase name un datasets and tables.
|
|
|
|
The lowercasing, by default, breaks the lineage construction in these cases.
|
|
|
|
"""
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
/*
|
|
|
|
HERE IS A STANDARD COMMENT BLOCK
|
|
|
|
THIS WILL NOT BREAK sqllineage
|
|
|
|
*/
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
|
|
|
|
#This, comment will not break sqllineage
|
|
|
|
SELECT foo, bar
|
|
|
|
-- this comment will not break sqllineage either
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
FROM `project.DataSet.CamelTable`
|
|
|
|
""",
|
2023-12-28 15:24:26 +05:30
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-12-14 10:58:03 -03:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.DataSet.CamelTable"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_camel_case_table_and_dataset_subquery():
|
|
|
|
"""
|
|
|
|
This test aims to test the parameter to ignore sqllineage lowercasing.
|
|
|
|
On the BigQuery service, it's possible to use uppercase name un datasets and tables.
|
|
|
|
The lowercasing, by default, breaks the lineage construction in these cases.
|
|
|
|
"""
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
/*
|
|
|
|
HERE IS A STANDARD COMMENT BLOCK
|
|
|
|
THIS WILL NOT BREAK sqllineage
|
|
|
|
*/
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
|
|
|
|
#This, comment will not break sqllineage
|
|
|
|
SELECT foo, bar
|
|
|
|
-- this comment will not break sqllineage either
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
FROM (
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
SELECT * FROM `project.DataSet.CamelTable`
|
|
|
|
)
|
|
|
|
""",
|
2023-12-28 15:24:26 +05:30
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-12-14 10:58:03 -03:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.DataSet.CamelTable"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins():
|
|
|
|
"""
|
|
|
|
This test aims to test the parameter to ignore sqllineage lowercasing.
|
|
|
|
On the BigQuery service, it's possible to use uppercase name un datasets and tables.
|
|
|
|
The lowercasing, by default, breaks the lineage construction in these cases.
|
|
|
|
"""
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
/*
|
|
|
|
HERE IS A STANDARD COMMENT BLOCK
|
|
|
|
THIS WILL NOT BREAK sqllineage
|
|
|
|
*/
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
|
|
|
|
#This, comment will not break sqllineage
|
|
|
|
SELECT foo, bar
|
|
|
|
-- this comment will not break sqllineage either
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
FROM `project.DataSet1.CamelTable`
|
|
|
|
INNER JOIN `project.DataSet2.CamelTable2`
|
|
|
|
ON b.id = a.id
|
|
|
|
LEFT JOIN `project.DataSet3.CamelTable3`
|
|
|
|
on c.id = b.id
|
|
|
|
""",
|
2023-12-28 15:24:26 +05:30
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-12-14 10:58:03 -03:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == [
|
|
|
|
"project.DataSet1.CamelTable",
|
|
|
|
"project.DataSet2.CamelTable2",
|
|
|
|
"project.DataSet3.CamelTable3",
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins_and_subquery():
|
|
|
|
"""
|
|
|
|
This test aims to test the parameter to ignore sqllineage lowercasing.
|
|
|
|
On the BigQuery service, it's possible to use uppercase name un datasets and tables.
|
|
|
|
The lowercasing, by default, breaks the lineage construction in these cases.
|
|
|
|
"""
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
/*
|
|
|
|
HERE IS A STANDARD COMMENT BLOCK
|
|
|
|
THIS WILL NOT BREAK sqllineage
|
|
|
|
*/
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS
|
|
|
|
#This, comment will not break sqllineage
|
|
|
|
SELECT foo, bar
|
|
|
|
-- this comment will not break sqllineage either
|
|
|
|
# this comment will not break sqllineage either
|
|
|
|
FROM `project.DataSet1.CamelTable` a
|
|
|
|
INNER JOIN `project.DataSet2.CamelTable2` b
|
|
|
|
ON b.id = a.id
|
|
|
|
LEFT JOIN (SELECT * FROM `project.DataSet3.CamelTable3`) c
|
|
|
|
ON c.id = b.id
|
|
|
|
""",
|
2023-12-28 15:24:26 +05:30
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-12-14 10:58:03 -03:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == [
|
|
|
|
"project.DataSet1.CamelTable",
|
|
|
|
"project.DataSet2.CamelTable2",
|
|
|
|
"project.DataSet3.CamelTable3",
|
|
|
|
]
|
|
|
|
|
|
|
|
|
2022-05-03 11:13:23 +02:00
|
|
|
def test_bigquery_sql_lineage_keyword_data_is_accepted():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
WITH data AS (
|
|
|
|
SELECT
|
|
|
|
*,
|
|
|
|
'foo' AS bar
|
|
|
|
FROM `project.example_dataset.example_table`
|
|
|
|
)
|
|
|
|
SELECT * FROM data
|
2023-12-28 15:24:26 +05:30
|
|
|
""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-05-03 11:13:23 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.example_dataset.example_table"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_keyword_admin_is_accepted():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
WITH admin AS (
|
|
|
|
SELECT *
|
|
|
|
FROM `project.example_dataset.example_table`
|
|
|
|
)
|
|
|
|
SELECT * FROM admin
|
2023-12-28 15:24:26 +05:30
|
|
|
""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-05-03 11:13:23 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.example_dataset.example_table"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_cte_alias_as_keyword_is_accepted():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
CREATE OR REPLACE TABLE `project.dataset.test_table` AS
|
|
|
|
WITH map AS (
|
|
|
|
SELECT a.col_1,
|
|
|
|
b.col_2
|
|
|
|
FROM (
|
|
|
|
SELECT DISTINCT *
|
|
|
|
FROM (
|
|
|
|
SELECT col_1
|
|
|
|
FROM `project.dataset.source_table_a`
|
|
|
|
)
|
|
|
|
) a
|
|
|
|
JOIN `project.dataset.source_table_b` b
|
|
|
|
ON a.col_1 = b.col_1
|
|
|
|
)
|
|
|
|
SELECT *
|
|
|
|
FROM map
|
2023-12-28 15:24:26 +05:30
|
|
|
""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-05-03 11:13:23 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == [
|
|
|
|
"project.dataset.source_table_a",
|
|
|
|
"project.dataset.source_table_b",
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_create_or_replace_view_name_with_hyphens_is_accepted():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
CREATE OR REPLACE VIEW test-project.dataset.test_view AS
|
|
|
|
SELECT *
|
|
|
|
FROM project.dataset.src_table_a
|
2024-09-09 21:09:44 -07:00
|
|
|
UNION ALL
|
2022-05-03 11:13:23 +02:00
|
|
|
SELECT * FROM `project.dataset.src_table_b`
|
2023-12-28 15:24:26 +05:30
|
|
|
""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-05-03 11:13:23 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == [
|
|
|
|
"project.dataset.src_table_a",
|
|
|
|
"project.dataset.src_table_b",
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_source_table_name_with_hyphens_is_accepted():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
CREATE OR REPLACE VIEW `project.dataset.test_view` AS
|
|
|
|
SELECT *
|
|
|
|
FROM test-project.dataset.src_table
|
2023-12-28 15:24:26 +05:30
|
|
|
""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-05-03 11:13:23 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["test-project.dataset.src_table"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_sql_lineage_from_as_column_name_is_accepted():
|
|
|
|
parser = BigQuerySQLParser(
|
|
|
|
sql_query="""
|
|
|
|
CREATE OR REPLACE VIEW `project.dataset.test_view` AS
|
|
|
|
SELECT x.from AS col
|
|
|
|
FROM project.dataset.src_table AS x
|
2023-12-28 15:24:26 +05:30
|
|
|
""",
|
|
|
|
schema_resolver=SchemaResolver(platform="bigquery"),
|
2022-05-03 11:13:23 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
assert parser.get_tables() == ["project.dataset.src_table"]
|