import json
import os
from typing import Dict
from unittest.mock import patch

from google.cloud.bigquery.table import TableListItem

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
    BigqueryTableIdentifier,
    BigQueryTableRef,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config


def test_bigquery_uri():
    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    assert config.get_sql_alchemy_url() == "bigquery://"
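

# Note: a bare project_id does not surface in the SQLAlchemy URL; contrast
# this with project_on_behalf in the next test.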


def test_bigquery_uri_on_behalf():
    config = BigQueryV2Config.parse_obj(
        {"project_id": "test-project", "project_on_behalf": "test-project-on-behalf"}
    )
    assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf"
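

# The inline `credential` block is materialized by BigQueryV2Config into a
# service-account JSON file on disk, with its location recorded in
# `_credentials_path`; that is what the test below reads back and cleans up.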


def test_bigquery_uri_with_credential():
    expected_credential_json = {
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "client_email": "test@acryl.io",
        "client_id": "test_client-id",
        "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test@acryl.io",
        "private_key": "random_private_key",
        "private_key_id": "test-private-key",
        "project_id": "test-project",
        "token_uri": "https://oauth2.googleapis.com/token",
        "type": "service_account",
    }

    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
            "credential": {
                "project_id": "test-project",
                "private_key_id": "test-private-key",
                "private_key": "random_private_key",
                "client_email": "test@acryl.io",
                "client_id": "test_client-id",
            },
        }
    )

    try:
        assert config.get_sql_alchemy_url() == "bigquery://"
        assert config._credentials_path

        with open(config._credentials_path) as json_file:
            json_credential = json.load(json_file)

        credential = json.dumps(json_credential, sort_keys=True)
        expected_credential = json.dumps(expected_credential_json, sort_keys=True)
        assert expected_credential == credential
    finally:
        # Remove the temporary credentials file whether or not the
        # assertions passed; the original only cleaned up on failure.
        if config._credentials_path:
            os.unlink(str(config._credentials_path))
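

# The four tests below exercise upstream (lineage) table resolution. By
# convention in these fixtures, a dataset whose name starts with an
# underscore (e.g. "_temp-dataset") denotes a temporary table: it should
# never be reported as an upstream itself, and its own upstreams should be
# spliced in transitively.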


def test_simple_upstream_table_generation():
    a: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="a"
        )
    )
    b: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="b"
        )
    )

    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
    lineage_metadata = {str(a): {str(b)}}
    upstreams = source.lineage_extractor.get_upstream_tables(
        str(a), lineage_metadata, []
    )
    assert list(upstreams) == [b]


def test_upstream_table_generation_with_temporary_table_without_temp_upstream():
    a: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="a"
        )
    )
    b: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="_temp-dataset", table="b"
        )
    )

    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    lineage_metadata = {str(a): {str(b)}}
    upstreams = source.lineage_extractor.get_upstream_tables(
        str(a), lineage_metadata, []
    )
    assert list(upstreams) == []


def test_upstream_table_generation_with_temporary_table_with_temp_upstream():
    a: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="a"
        )
    )
    b: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="_temp-dataset", table="b"
        )
    )
    c: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="c"
        )
    )

    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
    lineage_metadata = {
        str(a): {str(b)},
        str(b): {str(c)},
    }
    upstreams = source.lineage_extractor.get_upstream_tables(
        str(a), lineage_metadata, []
    )
    assert list(upstreams) == [c]
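

# For illustration only: a minimal sketch (not DataHub's implementation) of
# the transitive resolution the surrounding tests expect. `is_temp` stands in
# for however temporary tables are detected; these fixtures use a leading
# underscore on the dataset name.
def _resolve_upstreams_sketch(table, lineage, is_temp):
    resolved = set()
    for upstream in lineage.get(table, set()):
        if is_temp(upstream):
            # Skip the temporary table and splice in its own upstreams.
            resolved |= _resolve_upstreams_sketch(upstream, lineage, is_temp)
        else:
            resolved.add(upstream)
    return resolved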


def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream():
    a: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="a"
        )
    )
    b: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="_temp-dataset", table="b"
        )
    )
    c: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="c"
        )
    )
    d: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="_test-dataset", table="d"
        )
    )
    e: BigQueryTableRef = BigQueryTableRef(
        BigqueryTableIdentifier(
            project_id="test-project", dataset="test-dataset", table="e"
        )
    )

    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
        }
    )
    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
    lineage_metadata = {
        str(a): {str(b)},
        str(b): {str(c), str(d)},
        str(d): {str(e)},
    }
    upstreams = source.lineage_extractor.get_upstream_tables(
        str(a), lineage_metadata, []
    )
    # list.sort() returns None, so `list(upstreams).sort() == [c, e].sort()`
    # would always compare None == None; compare sorted representations instead.
    assert sorted(str(u) for u in upstreams) == sorted([str(c), str(e)])
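

# The test below checks shard collapsing: date-sharded tables named
# `<prefix>_YYYYMMDD` should be reduced to a single logical table, keeping
# only the latest shard (here test-sharded-table_20220102) alongside the
# regular test-table.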


@patch(
    "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset"
)
@patch("google.cloud.bigquery.client.Client")
def test_table_processing_logic(client_mock, data_dictionary_mock):
    config = BigQueryV2Config.parse_obj(
        {
            "project_id": "test-project",
        }
    )

    table_list_items = [
        TableListItem(
            {
                "tableReference": {
                    "projectId": "test-project",
                    "datasetId": "test-dataset",
                    "tableId": "test-table",
                }
            }
        ),
        TableListItem(
            {
                "tableReference": {
                    "projectId": "test-project",
                    "datasetId": "test-dataset",
                    "tableId": "test-sharded-table_20220102",
                }
            }
        ),
        TableListItem(
            {
                "tableReference": {
                    "projectId": "test-project",
                    "datasetId": "test-dataset",
                    "tableId": "test-sharded-table_20210101",
                }
            }
        ),
        TableListItem(
            {
                "tableReference": {
                    "projectId": "test-project",
                    "datasetId": "test-dataset",
                    "tableId": "test-sharded-table_20220101",
                }
            }
        ),
    ]

    client_mock.list_tables.return_value = table_list_items
    data_dictionary_mock.get_tables_for_dataset.return_value = None

    source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))

    _ = source.get_tables_for_dataset(
        conn=client_mock, project_id="test-project", dataset_name="test-dataset"
    )

    assert data_dictionary_mock.call_count == 1

    # mock.call_args.args is only available from Python 3.8, so index into
    # call_args_list positionally instead: the tables dict is the fourth
    # positional argument.
    tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][3]
    for table in tables.keys():
        assert table in ["test-table", "test-sharded-table_20220102"]