mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-29 20:39:56 +00:00
177 lines
6.2 KiB
Python
177 lines
6.2 KiB
Python
import datetime
|
|
from typing import Dict, List, Set
|
|
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
|
|
BigQueryTableRef,
|
|
QueryEvent,
|
|
)
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigqueryView
|
|
from datahub.ingestion.source.bigquery_v2.lineage import (
|
|
BigqueryLineageExtractor,
|
|
LineageEdge,
|
|
)
|
|
|
|
|
|
def test_parse_view_lineage():
    """Check that a fully qualified upstream table is extracted from view DDL.

    The view definition selects from a backtick-quoted three-part table
    name; the test expects exactly one upstream carrying that same name.
    """
    bq_extractor = BigqueryLineageExtractor(BigQueryV2Config(), BigQueryV2Report())

    view_sql = """CREATE VIEW `my-project.my-dataset.test_table`
AS SELECT
* REPLACE(
myrandom(something) AS something)
FROM
`my-project2.my-dataset2.test_physical_table`;
"""
    created_at = datetime.datetime.now(tz=datetime.timezone.utc)
    altered_at = datetime.datetime.now(tz=datetime.timezone.utc)
    view_under_test = BigqueryView(
        name="test",
        created=created_at,
        last_altered=altered_at,
        comment="",
        view_definition=view_sql,
    )

    upstreams = bq_extractor.parse_view_lineage(
        "my_project", "my_dataset", view_under_test
    )

    assert upstreams is not None
    assert len(upstreams) == 1
    first_upstream_name = upstreams[0].get_table_name()
    assert first_upstream_name == "my-project2.my-dataset2.test_physical_table"
|
|
|
|
|
|
def test_parse_view_lineage_with_two_part_table_name():
    """Check lineage parsing when the source table is ``dataset.table``.

    The expected upstream name is the two-part name from the DDL prefixed
    with the project id passed to ``parse_view_lineage``.
    """
    bq_extractor = BigqueryLineageExtractor(BigQueryV2Config(), BigQueryV2Report())

    view_sql = "CREATE VIEW my_view as select * from some_dataset.sometable as a"
    created_at = datetime.datetime.now(tz=datetime.timezone.utc)
    altered_at = datetime.datetime.now(tz=datetime.timezone.utc)
    view_under_test = BigqueryView(
        name="test",
        created=created_at,
        last_altered=altered_at,
        comment="",
        view_definition=view_sql,
    )

    upstreams = bq_extractor.parse_view_lineage(
        "my_project", "my_dataset", view_under_test
    )

    assert upstreams is not None
    assert len(upstreams) == 1
    first_upstream_name = upstreams[0].get_table_name()
    assert first_upstream_name == "my_project.some_dataset.sometable"
|
|
|
|
|
|
def test_one_part_table():
    """Check lineage parsing when the source table is a bare table name.

    The expected upstream name is the bare name from the DDL prefixed with
    the project id and dataset name passed to ``parse_view_lineage``.
    """
    bq_extractor = BigqueryLineageExtractor(BigQueryV2Config(), BigQueryV2Report())

    view_sql = "CREATE VIEW my_view as select * from sometable as a"
    created_at = datetime.datetime.now(tz=datetime.timezone.utc)
    altered_at = datetime.datetime.now(tz=datetime.timezone.utc)
    view_under_test = BigqueryView(
        name="test",
        created=created_at,
        last_altered=altered_at,
        comment="",
        view_definition=view_sql,
    )

    upstreams = bq_extractor.parse_view_lineage(
        "my_project", "my_dataset", view_under_test
    )

    assert upstreams is not None
    assert len(upstreams) == 1
    first_upstream_name = upstreams[0].get_table_name()
    assert first_upstream_name == "my_project.my_dataset.sometable"
|
|
|
|
|
|
def test_create_statement_with_multiple_table():
    """Check that every table in a UNION view definition is reported.

    The view selects from two fully qualified tables joined by ``union``;
    the test expects both to come back as upstreams. The result is sorted
    by table name before asserting so the check does not depend on the
    order in which the extractor returns them.
    """
    config = BigQueryV2Config()
    report = BigQueryV2Report()
    extractor = BigqueryLineageExtractor(config, report)

    ddl = "CREATE VIEW my_view as select * from my_project_2.my_dataset_2.sometable union select * from my_project_2.my_dataset_2.sometable2 as a"
    view = BigqueryView(
        name="test",
        # Timezone-aware timestamps, matching the other tests in this module;
        # naive datetimes must not be mixed with aware ones.
        created=datetime.datetime.now(tz=datetime.timezone.utc),
        last_altered=datetime.datetime.now(tz=datetime.timezone.utc),
        comment="",
        view_definition=ddl,
    )
    tables = extractor.parse_view_lineage("my_project", "my_dataset", view)
    assert tables is not None

    tables.sort(key=lambda e: e.get_table_name())
    assert 2 == len(tables)
    assert "my_project_2.my_dataset_2.sometable" == tables[0].get_table_name()
    assert "my_project_2.my_dataset_2.sometable2" == tables[1].get_table_name()
|
|
|
|
|
|
def test_lineage_with_timestamps():
    """Check upstream lineage built from query events with differing end times.

    Three query events write to the same destination table: one with
    ``end_time=None`` and two referenced tables, one with an explicit
    ``end_time`` and one referenced table, and one that omits ``end_time``
    and references a single view. The lineage computed for the destination
    table is expected to list four upstreams.
    """
    lineage_extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(
        BigQueryV2Config(), BigQueryV2Report()
    )

    # Event 1: two source tables, end_time explicitly None.
    event_without_end_time = QueryEvent(
        timestamp=datetime.datetime.now(tz=datetime.timezone.utc),
        actor_email="bla@bla.com",
        query="testQuery",
        statementType="SELECT",
        project_id="proj_12344",
        end_time=None,
        referencedTables=[
            BigQueryTableRef.from_string_name(
                "projects/my_project/datasets/my_dataset/tables/my_source_table1"
            ),
            BigQueryTableRef.from_string_name(
                "projects/my_project/datasets/my_dataset/tables/my_source_table2"
            ),
        ],
        destinationTable=BigQueryTableRef.from_string_name(
            "projects/my_project/datasets/my_dataset/tables/my_table"
        ),
    )

    # Event 2: one source table, fixed end_time.
    event_with_end_time = QueryEvent(
        timestamp=datetime.datetime.now(tz=datetime.timezone.utc),
        actor_email="bla@bla.com",
        query="testQuery",
        statementType="SELECT",
        project_id="proj_12344",
        end_time=datetime.datetime.fromtimestamp(
            1617295943.17321, tz=datetime.timezone.utc
        ),
        referencedTables=[
            BigQueryTableRef.from_string_name(
                "projects/my_project/datasets/my_dataset/tables/my_source_table3"
            ),
        ],
        destinationTable=BigQueryTableRef.from_string_name(
            "projects/my_project/datasets/my_dataset/tables/my_table"
        ),
    )

    # Event 3: one source view, end_time not passed at all.
    event_from_view = QueryEvent(
        timestamp=datetime.datetime.now(tz=datetime.timezone.utc),
        actor_email="bla@bla.com",
        query="testQuery",
        statementType="SELECT",
        project_id="proj_12344",
        referencedViews=[
            BigQueryTableRef.from_string_name(
                "projects/my_project/datasets/my_dataset/tables/my_source_view1"
            ),
        ],
        destinationTable=BigQueryTableRef.from_string_name(
            "projects/my_project/datasets/my_dataset/tables/my_table"
        ),
    )

    query_events: List[QueryEvent] = [
        event_without_end_time,
        event_with_end_time,
        event_from_view,
    ]

    destination_table = BigQueryTableRef.from_string_name(
        "projects/my_project/datasets/my_dataset/tables/my_table"
    )

    edges_by_table: Dict[str, Set[LineageEdge]] = (
        lineage_extractor._create_lineage_map(iter(query_events))
    )

    lineage_result = lineage_extractor.get_lineage_for_table(
        bq_table=destination_table,
        lineage_metadata=edges_by_table,
        platform="bigquery",
    )

    assert lineage_result
    upstream_entries = lineage_result[0].upstreams
    assert len(upstream_entries) == 4
|