2021-12-27 14:48:45 +01:00
|
|
|
import json
|
|
|
|
import os
|
2023-02-23 00:53:35 +01:00
|
|
|
from datetime import datetime
|
2023-02-23 19:40:00 -05:00
|
|
|
from types import SimpleNamespace
|
2023-02-13 19:04:19 +01:00
|
|
|
from typing import Dict
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
|
|
from google.cloud.bigquery.table import TableListItem
|
2021-12-27 14:48:45 +01:00
|
|
|
|
2022-04-08 01:40:21 +05:30
|
|
|
from datahub.ingestion.api.common import PipelineContext
|
2022-12-29 22:19:05 +01:00
|
|
|
from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source
|
|
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
|
|
|
|
BigqueryTableIdentifier,
|
|
|
|
BigQueryTableRef,
|
|
|
|
)
|
|
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
|
2023-02-23 19:40:00 -05:00
|
|
|
from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigqueryProject
|
2023-02-23 00:53:35 +01:00
|
|
|
from datahub.ingestion.source.bigquery_v2.lineage import LineageEdge
|
2022-04-08 01:40:21 +05:30
|
|
|
|
2021-12-27 14:48:45 +01:00
|
|
|
|
|
|
|
def test_bigquery_uri():
    """With only a project_id configured, the SQLAlchemy URL is a bare bigquery://."""
    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})
    assert config.get_sql_alchemy_url() == "bigquery://"
|
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_uri_on_behalf():
    """project_on_behalf, when set, becomes the host part of the SQLAlchemy URL."""
    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
            "project_on_behalf": "test-project-on-behalf",
        }
    )
    assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf"
|
2021-12-27 14:48:45 +01:00
|
|
|
|
|
|
|
|
|
|
|
def test_bigquery_uri_with_credential():
    """A `credential` dict in the config is materialized into a temporary
    service-account key file, with the well-known Google OAuth endpoint
    fields filled in with their defaults.

    Fix: the temp key file was previously removed only when an assertion
    failed, leaking it on the success path; cleanup now runs unconditionally
    via ``finally`` (an AssertionError still propagates).
    """
    expected_credential_json = {
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "client_email": "test@acryl.io",
        "client_id": "test_client-id",
        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test@acryl.io",
        "private_key": "random_private_key",
        "private_key_id": "test-private-key",
        "project_id": "test-project",
        "token_uri": "https://oauth2.googleapis.com/token",
        "type": "service_account",
    }

    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
            "credential": {
                "project_id": "test-project",
                "private_key_id": "test-private-key",
                "private_key": "random_private_key",
                "client_email": "test@acryl.io",
                "client_id": "test_client-id",
            },
        }
    )

    try:
        assert config.get_sql_alchemy_url() == "bigquery://"
        assert config._credentials_path

        # `with` closes the file; no explicit close() needed.
        with open(config._credentials_path) as credentials_file:
            json_credential = json.load(credentials_file)

        # Compare canonicalized (key-sorted) JSON so dict ordering is irrelevant.
        credential = json.dumps(json_credential, sort_keys=True)
        expected_credential = json.dumps(expected_credential_json, sort_keys=True)
        assert expected_credential == credential
    finally:
        # Always remove the temporary key file, pass or fail.
        if config._credentials_path:
            os.unlink(str(config._credentials_path))
|
2022-02-20 23:23:23 +01:00
|
|
|
|
|
|
|
|
2023-02-23 19:40:00 -05:00
|
|
|
@patch("google.cloud.bigquery.client.Client")
def test_get_projects_with_project_ids(client_mock):
    """Explicit project_ids are used verbatim — list_projects is never called,
    and a simultaneously-set project_id is ignored."""
    expected = [
        BigqueryProject("test-1", "test-1"),
        BigqueryProject("test-2", "test-2"),
    ]

    config = BigQueryV2Config.parse_obj({"project_ids": ["test-1", "test-2"]})
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1"))
    assert source._get_projects(client_mock) == expected
    assert client_mock.list_projects.call_count == 0

    # project_id alongside project_ids: project_ids still wins.
    config = BigQueryV2Config.parse_obj(
        {"project_ids": ["test-1", "test-2"], "project_id": "test-3"}
    )
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test2"))
    assert source._get_projects(client_mock) == expected
    assert client_mock.list_projects.call_count == 0
|
|
|
|
|
|
|
|
|
|
|
|
@patch("google.cloud.bigquery.client.Client")
def test_get_projects_with_single_project_id(client_mock):
    """A lone project_id yields exactly that project without calling list_projects."""
    config = BigQueryV2Config.parse_obj({"project_id": "test-3"})
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1"))

    assert source._get_projects(client_mock) == [BigqueryProject("test-3", "test-3")]
    assert client_mock.list_projects.call_count == 0
|
|
|
|
|
|
|
|
|
|
|
|
@patch("google.cloud.bigquery.client.Client")
def test_get_projects(client_mock):
    """With no projects configured, the source discovers them via list_projects,
    mapping project_id/friendly_name onto BigqueryProject."""
    client_mock.list_projects.return_value = [
        SimpleNamespace(project_id="test-1", friendly_name="one"),
        SimpleNamespace(project_id="test-2", friendly_name="two"),
    ]

    source = BigqueryV2Source(
        config=BigQueryV2Config.parse_obj({}),
        ctx=PipelineContext(run_id="test1"),
    )
    assert source._get_projects(client_mock) == [
        BigqueryProject("test-1", "one"),
        BigqueryProject("test-2", "two"),
    ]
    assert client_mock.list_projects.call_count == 1
|
|
|
|
|
|
|
|
|
2022-02-20 23:23:23 +01:00
|
|
|
def test_simple_upstream_table_generation():
    """A direct a <- b lineage edge surfaces b as the sole upstream of a."""
    a = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="a"
        )
    )
    b = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="b"
        )
    )

    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    lineage_metadata = {str(a): {LineageEdge(table=str(b), auditStamp=datetime.now())}}
    upstreams = source.lineage_extractor.get_upstream_tables(a, lineage_metadata, [])

    assert len(upstreams) == 1
    assert list(upstreams)[0].table == str(b)
|
2022-02-20 23:23:23 +01:00
|
|
|
|
|
|
|
|
|
|
|
def test_upstream_table_generation_with_temporary_table_without_temp_upstream():
    """An upstream in a `_`-prefixed (temporary) dataset with no upstream of its
    own is dropped, leaving `a` with no upstreams at all."""
    a = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="a"
        )
    )
    b = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="_temp-dataset", table="b"
        )
    )

    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    lineage_metadata = {str(a): {LineageEdge(table=str(b), auditStamp=datetime.now())}}
    upstreams = source.lineage_extractor.get_upstream_tables(a, lineage_metadata, [])

    assert list(upstreams) == []
|
|
|
|
|
|
|
|
|
|
|
|
def test_upstream_table_generation_with_temporary_table_with_temp_upstream():
    """a <- b <- c where b lives in a `_`-prefixed (temporary) dataset:
    b is collapsed and c is reported as the (transitive) upstream of a.

    Fix: removed a redundant function-local import of PipelineContext, which
    is already imported at module level.
    """
    a = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="a"
        )
    )
    b = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="_temp-dataset", table="b"
        )
    )
    c = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="c"
        )
    )

    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    lineage_metadata = {
        str(a): {LineageEdge(table=str(b), auditStamp=datetime.now())},
        str(b): {LineageEdge(table=str(c), auditStamp=datetime.now())},
    }
    upstreams = source.lineage_extractor.get_upstream_tables(a, lineage_metadata, [])

    assert len(upstreams) == 1
    assert list(upstreams)[0].table == str(c)
|
2022-02-20 23:23:23 +01:00
|
|
|
|
|
|
|
|
|
|
|
def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream():
    """a <- b(temp) <- {c, d(temp)} and d <- e: both temporary tables collapse,
    leaving c and e as the upstreams of a."""

    def _ref(dataset: str, table: str) -> BigQueryTableRef:
        # All refs in this test share the same project; only dataset/table vary.
        return BigQueryTableRef(
            BigqueryTableIdentifier(
                project_id="test-project", dataset=dataset, table=table
            )
        )

    a = _ref("test-dataset", "a")
    b = _ref("_temp-dataset", "b")
    c = _ref("test-dataset", "c")
    d = _ref("_test-dataset", "d")
    e = _ref("test-dataset", "e")

    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    lineage_metadata = {
        str(a): {LineageEdge(table=str(b), auditStamp=datetime.now())},
        str(b): {
            LineageEdge(table=str(c), auditStamp=datetime.now()),
            LineageEdge(table=str(d), auditStamp=datetime.now()),
        },
        str(d): {LineageEdge(table=str(e), auditStamp=datetime.now())},
    }
    upstreams = source.lineage_extractor.get_upstream_tables(a, lineage_metadata, [])

    sorted_list = sorted(upstreams)
    assert sorted_list[0].table == str(c)
    assert sorted_list[1].table == str(e)
|
2023-02-13 19:04:19 +01:00
|
|
|
|
|
|
|
|
|
|
|
@patch(
    "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset"
)
@patch("google.cloud.bigquery.client.Client")
def test_table_processing_logic(client_mock, data_dictionary_mock):
    """Sharded tables (`name_YYYYMMDD`) are de-duplicated down to the newest
    shard before the data dictionary is queried."""
    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})

    def _item(table_id: str) -> TableListItem:
        # Build a TableListItem within the fixed test project/dataset.
        return TableListItem(
            {
                "tableReference": {
                    "projectId": "test-project",
                    "datasetId": "test-dataset",
                    "tableId": table_id,
                }
            }
        )

    tableListItems = [
        _item("test-table"),
        _item("test-sharded-table_20220102"),
        _item("test-sharded-table_20210101"),
        _item("test-sharded-table_20220101"),
    ]

    client_mock.list_tables.return_value = tableListItems
    data_dictionary_mock.get_tables_for_dataset.return_value = None

    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    _ = source.get_tables_for_dataset(
        conn=client_mock, project_id="test-project", dataset_name="test-dataset"
    )

    assert data_dictionary_mock.call_count == 1

    # `call.args` only exists from python 3.8 onward, hence the positional
    # digging through call_args_list; index 3 is the tables mapping argument.
    tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][3]
    for table in tables.keys():
        # Only the plain table and the newest shard should remain.
        assert table in ["test-table", "test-sharded-table_20220102"]
|
2023-02-27 11:10:24 +01:00
|
|
|
|
|
|
|
|
|
|
|
@patch(
    "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset"
)
@patch("google.cloud.bigquery.client.Client")
def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_mock):
    """Tables whose entire name is a date (`YYYYMMDD`) are treated as shards of
    one logical table, so only the newest date plus the plain table survive."""
    config = BigQueryV2Config.parse_obj({"project_id": "test-project"})

    def _item(table_id: str) -> TableListItem:
        # Build a TableListItem within the fixed test project/dataset.
        return TableListItem(
            {
                "tableReference": {
                    "projectId": "test-project",
                    "datasetId": "test-dataset",
                    "tableId": table_id,
                }
            }
        )

    tableListItems = [
        _item("test-table"),
        _item("20220102"),
        _item("20210101"),
        _item("20220103"),
    ]

    client_mock.list_tables.return_value = tableListItems
    data_dictionary_mock.get_tables_for_dataset.return_value = None

    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    _ = source.get_tables_for_dataset(
        conn=client_mock, project_id="test-project", dataset_name="test-dataset"
    )

    assert data_dictionary_mock.call_count == 1

    # `call.args` only exists from python 3.8 onward, hence the positional
    # digging through call_args_list; index 3 is the tables mapping argument.
    tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][3]
    for table in tables.keys():
        # Only the plain table and the newest date-shard should remain.
        assert tables[table].table_id in ["test-table", "20220103"]
|