datahub/metadata-ingestion/tests/unit/test_bigquery_lineage.py

177 lines
6.2 KiB
Python

import datetime
from typing import Dict, List, Set
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
BigQueryTableRef,
QueryEvent,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigqueryView
from datahub.ingestion.source.bigquery_v2.lineage import (
BigqueryLineageExtractor,
LineageEdge,
)
def test_parse_view_lineage():
    """Check lineage parsing for a view that selects from a fully qualified table.

    The view DDL uses ``SELECT * REPLACE(...)`` over a backtick-quoted
    three-part table name. The test expects exactly one upstream table whose
    name is the one written in the DDL, not the project/dataset passed to
    ``parse_view_lineage``.
    """
    utc = datetime.timezone.utc
    lineage_extractor = BigqueryLineageExtractor(BigQueryV2Config(), BigQueryV2Report())

    view_sql = """CREATE VIEW `my-project.my-dataset.test_table`
AS SELECT
* REPLACE(
myrandom(something) AS something)
FROM
`my-project2.my-dataset2.test_physical_table`;
"""
    test_view = BigqueryView(
        name="test",
        created=datetime.datetime.now(tz=utc),
        last_altered=datetime.datetime.now(tz=utc),
        comment="",
        view_definition=view_sql,
    )

    upstream_tables = lineage_extractor.parse_view_lineage(
        "my_project", "my_dataset", test_view
    )

    assert upstream_tables is not None
    # Comparing the full list of names checks both the count and the name.
    upstream_names = [table.get_table_name() for table in upstream_tables]
    assert upstream_names == ["my-project2.my-dataset2.test_physical_table"]
def test_parse_view_lineage_with_two_part_table_name():
    """Check lineage parsing when the source table is written as ``dataset.table``.

    The test expects the single upstream table name to be prefixed with the
    project passed to ``parse_view_lineage`` ("my_project").
    """
    utc = datetime.timezone.utc
    lineage_extractor = BigqueryLineageExtractor(BigQueryV2Config(), BigQueryV2Report())

    view_sql = "CREATE VIEW my_view as select * from some_dataset.sometable as a"
    test_view = BigqueryView(
        name="test",
        created=datetime.datetime.now(tz=utc),
        last_altered=datetime.datetime.now(tz=utc),
        comment="",
        view_definition=view_sql,
    )

    upstream_tables = lineage_extractor.parse_view_lineage(
        "my_project", "my_dataset", test_view
    )

    assert upstream_tables is not None
    # Comparing the full list of names checks both the count and the name.
    upstream_names = [table.get_table_name() for table in upstream_tables]
    assert upstream_names == ["my_project.some_dataset.sometable"]
def test_one_part_table():
    """Check lineage parsing when the source table is written as a bare table name.

    The test expects the single upstream table name to be prefixed with both
    the project and the dataset passed to ``parse_view_lineage``.
    """
    utc = datetime.timezone.utc
    lineage_extractor = BigqueryLineageExtractor(BigQueryV2Config(), BigQueryV2Report())

    view_sql = "CREATE VIEW my_view as select * from sometable as a"
    test_view = BigqueryView(
        name="test",
        created=datetime.datetime.now(tz=utc),
        last_altered=datetime.datetime.now(tz=utc),
        comment="",
        view_definition=view_sql,
    )

    upstream_tables = lineage_extractor.parse_view_lineage(
        "my_project", "my_dataset", test_view
    )

    assert upstream_tables is not None
    # Comparing the full list of names checks both the count and the name.
    upstream_names = [table.get_table_name() for table in upstream_tables]
    assert upstream_names == ["my_project.my_dataset.sometable"]
def test_create_statement_with_multiple_table():
    """Check lineage parsing for a view that unions two fully qualified tables.

    The view DDL selects from two three-part table names joined by ``union``.
    The test expects both tables to be reported as upstreams, compared in
    sorted order so the result does not depend on the parser's output order.
    """
    config = BigQueryV2Config()
    report = BigQueryV2Report()
    extractor = BigqueryLineageExtractor(config, report)

    ddl = (
        "CREATE VIEW my_view as "
        "select * from my_project_2.my_dataset_2.sometable "
        "union "
        "select * from my_project_2.my_dataset_2.sometable2 as a"
    )
    # Use timezone-aware UTC timestamps, matching the other view tests in this
    # file; naive ``datetime.now()`` values cannot be compared with aware ones.
    view = BigqueryView(
        name="test",
        created=datetime.datetime.now(tz=datetime.timezone.utc),
        last_altered=datetime.datetime.now(tz=datetime.timezone.utc),
        comment="",
        view_definition=ddl,
    )

    tables = extractor.parse_view_lineage("my_project", "my_dataset", view)

    assert tables is not None
    # A single comparison on the sorted names checks the count and both names.
    table_names = sorted(table.get_table_name() for table in tables)
    assert table_names == [
        "my_project_2.my_dataset_2.sometable",
        "my_project_2.my_dataset_2.sometable2",
    ]
def test_lineage_with_timestamps():
    """Check upstream lineage built from several query events for one table.

    Three ``QueryEvent`` objects all write to the same destination table:
    one without an end time referencing two source tables, one with an end
    time referencing a third source table, and one referencing a source view.
    The test expects the resulting lineage for the destination table to list
    four upstreams.
    """
    utc = datetime.timezone.utc
    parse_ref = BigQueryTableRef.from_string_name

    extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(
        BigQueryV2Config(), BigQueryV2Report()
    )

    # Event with ``end_time`` explicitly unset and two referenced tables.
    event_without_end_time = QueryEvent(
        timestamp=datetime.datetime.now(tz=utc),
        actor_email="bla@bla.com",
        query="testQuery",
        statementType="SELECT",
        project_id="proj_12344",
        end_time=None,
        referencedTables=[
            parse_ref(
                "projects/my_project/datasets/my_dataset/tables/my_source_table1"
            ),
            parse_ref(
                "projects/my_project/datasets/my_dataset/tables/my_source_table2"
            ),
        ],
        destinationTable=parse_ref(
            "projects/my_project/datasets/my_dataset/tables/my_table"
        ),
    )

    # Event with a fixed end time and one referenced table.
    event_with_end_time = QueryEvent(
        timestamp=datetime.datetime.now(tz=utc),
        actor_email="bla@bla.com",
        query="testQuery",
        statementType="SELECT",
        project_id="proj_12344",
        end_time=datetime.datetime.fromtimestamp(1617295943.17321, tz=utc),
        referencedTables=[
            parse_ref(
                "projects/my_project/datasets/my_dataset/tables/my_source_table3"
            ),
        ],
        destinationTable=parse_ref(
            "projects/my_project/datasets/my_dataset/tables/my_table"
        ),
    )

    # Event that references a view instead of a table; no end time is passed.
    event_with_view = QueryEvent(
        timestamp=datetime.datetime.now(tz=utc),
        actor_email="bla@bla.com",
        query="testQuery",
        statementType="SELECT",
        project_id="proj_12344",
        referencedViews=[
            parse_ref(
                "projects/my_project/datasets/my_dataset/tables/my_source_view1"
            ),
        ],
        destinationTable=parse_ref(
            "projects/my_project/datasets/my_dataset/tables/my_table"
        ),
    )

    lineage_entries: List[QueryEvent] = [
        event_without_end_time,
        event_with_end_time,
        event_with_view,
    ]

    target_table = parse_ref("projects/my_project/datasets/my_dataset/tables/my_table")
    lineage_map: Dict[str, Set[LineageEdge]] = extractor._create_lineage_map(
        iter(lineage_entries)
    )

    upstream_lineage = extractor.get_lineage_for_table(
        bq_table=target_table,
        lineage_metadata=lineage_map,
        platform="bigquery",
    )

    assert upstream_lineage
    upstreams = upstream_lineage[0].upstreams
    assert len(upstreams) == 4