2022-12-28 21:50:37 +05:30
|
|
|
import random
|
|
|
|
import string
|
|
|
|
from datetime import datetime, timezone
|
2023-02-20 08:46:10 -05:00
|
|
|
from typing import cast
|
2022-12-28 21:50:37 +05:30
|
|
|
from unittest import mock
|
|
|
|
|
|
|
|
import pandas as pd
|
2023-02-15 16:32:53 +05:30
|
|
|
import pytest
|
2022-12-28 21:50:37 +05:30
|
|
|
from freezegun import freeze_time
|
|
|
|
|
|
|
|
from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig
|
|
|
|
from datahub.ingestion.glossary.classifier import (
|
|
|
|
ClassificationConfig,
|
|
|
|
DynamicTypedClassifierConfig,
|
|
|
|
)
|
|
|
|
from datahub.ingestion.glossary.datahub_classifier import (
|
|
|
|
DataHubClassifierConfig,
|
|
|
|
InfoTypeConfig,
|
|
|
|
PredictionFactorsAndWeights,
|
2023-06-06 18:11:51 +05:30
|
|
|
ValuesFactorConfig,
|
2022-12-28 21:50:37 +05:30
|
|
|
)
|
|
|
|
from datahub.ingestion.run.pipeline import Pipeline
|
|
|
|
from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig
|
|
|
|
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
|
2023-01-04 23:05:23 +02:00
|
|
|
from datahub.ingestion.source.snowflake.snowflake_config import (
|
|
|
|
SnowflakeV2Config,
|
|
|
|
TagOption,
|
|
|
|
)
|
2023-02-20 08:46:10 -05:00
|
|
|
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
2022-12-28 21:50:37 +05:30
|
|
|
from tests.integration.snowflake.common import FROZEN_TIME, default_query_results
|
|
|
|
from tests.test_helpers import mce_helpers
|
|
|
|
|
|
|
|
|
|
|
|
def random_email():
    """Return a random fake email address at the ``xyz.com`` domain.

    The local part is 10-15 random lowercase ASCII letters. Used to
    generate sample column values that the classifier should detect
    as email addresses.
    """
    # random.choices draws k characters in one call instead of a
    # manual loop with an unused index variable.
    local_part_length = random.randint(10, 15)
    local_part = "".join(random.choices(string.ascii_lowercase, k=local_part_length))
    return local_part + "@xyz.com"
|
|
|
|
|
|
|
|
|
2023-06-06 18:11:51 +05:30
|
|
|
def random_cloud_region():
    """Return a random AWS-style region identifier, e.g. ``eu-west-1``.

    Used to generate sample column values that the classifier should
    detect as cloud regions.
    """
    # Same three random draws, in the same order, as the original
    # implementation; only the assembly is expressed differently.
    geo = random.choice(["af", "ap", "ca", "eu", "me", "sa", "us"])
    direction = random.choice(["central", "north", "south", "east", "west"])
    suffix = random.randint(1, 2)
    return f"{geo}-{direction}-{suffix}"
|
|
|
|
|
|
|
|
|
2022-12-28 21:50:37 +05:30
|
|
|
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
    """Full-pipeline Snowflake ingestion test against a mocked connector.

    Patches ``snowflake.connector.connect`` (canned query results from
    ``default_query_results``) and the source's sample-value fetch (a
    synthetic DataFrame), runs a pipeline with classification, profiling,
    lineage, usage and tag extraction enabled, then compares the emitted
    events against the golden file and checks the report's LRU cache
    counters.

    ``mock_time`` and ``mock_datahub_graph`` are fixture parameters that
    are not referenced directly here — presumably conftest-provided
    patches activated by injection; TODO confirm against conftest.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake"

    # Run the metadata ingestion pipeline.
    output_file = tmp_path / "snowflake_test_events.json"
    golden_file = test_resources_dir / "snowflake_golden.json"

    with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch(
        "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table"
    ) as mock_sample_values:
        # Wire connect() -> connection -> cursor so every query the source
        # issues is answered by the canned side_effect below.
        sf_connection = mock.MagicMock()
        sf_cursor = mock.MagicMock()
        mock_connect.return_value = sf_connection
        sf_connection.cursor.return_value = sf_cursor

        sf_cursor.execute.side_effect = default_query_results

        # Synthetic sample data for classification: col_1 looks like ages,
        # col_2 like emails, col_3 like cloud regions.
        mock_sample_values.return_value = pd.DataFrame(
            data={
                "col_1": [random.randint(1, 80) for i in range(20)],
                "col_2": [random_email() for i in range(20)],
                "col_3": [random_cloud_region() for i in range(20)],
            }
        )

        # Classifier tuned so Age/CloudRegion are predicted purely from the
        # column VALUES (all other factor weights are zero).
        datahub_classifier_config = DataHubClassifierConfig(
            minimum_values_threshold=10,
            confidence_level_threshold=0.58,
            info_types_config={
                "Age": InfoTypeConfig(
                    Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
                        Name=0, Values=1, Description=0, Datatype=0
                    )
                ),
                "CloudRegion": InfoTypeConfig(
                    Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
                        Name=0,
                        Description=0,
                        Datatype=0,
                        Values=1,
                    ),
                    # Regex matching the <geo>-<direction>-<digit> strings
                    # produced by random_cloud_region above.
                    Values=ValuesFactorConfig(
                        prediction_type="regex",
                        regex=[
                            r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+"
                        ],
                    ),
                ),
            },
        )

        pipeline = Pipeline(
            config=PipelineConfig(
                source=SourceConfig(
                    type="snowflake",
                    config=SnowflakeV2Config(
                        account_id="ABC12345.ap-south-1.aws",
                        username="TST_USR",
                        password="TST_PWD",
                        match_fully_qualified_names=True,
                        schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]),
                        include_technical_schema=True,
                        include_table_lineage=True,
                        include_view_lineage=True,
                        include_usage_stats=True,
                        use_legacy_lineage_method=False,
                        validate_upstreams_against_patterns=False,
                        include_operational_stats=True,
                        # Usage/operational window: one UTC day, matching the
                        # canned query results in common.py.
                        start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
                            tzinfo=timezone.utc
                        ),
                        end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(
                            tzinfo=timezone.utc
                        ),
                        classification=ClassificationConfig(
                            enabled=True,
                            # Only classify the three synthetic sample columns.
                            column_pattern=AllowDenyPattern(
                                allow=[".*col_1$", ".*col_2$", ".*col_3$"]
                            ),
                            classifiers=[
                                DynamicTypedClassifierConfig(
                                    type="datahub", config=datahub_classifier_config
                                )
                            ],
                        ),
                        # Table-level profiling with all size/recency limits
                        # disabled so every table is profiled deterministically.
                        profiling=GEProfilingConfig(
                            enabled=True,
                            profile_if_updated_since_days=None,
                            profile_table_row_limit=None,
                            profile_table_size_limit=None,
                            profile_table_level_only=True,
                        ),
                        extract_tags=TagOption.without_lineage,
                    ),
                ),
                sink=DynamicTypedConfig(
                    type="file", config={"filename": str(output_file)}
                ),
            )
        )
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status()

    # Verify the output.

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=output_file,
        golden_path=golden_file,
        ignore_paths=[],
    )
    # Each cached metadata fetch should have been executed exactly once
    # (single database/schema in the canned results).
    report = cast(SnowflakeV2Report, pipeline.source.get_report())
    assert report.lru_cache_info["get_tables_for_database"]["misses"] == 1
    assert report.lru_cache_info["get_views_for_database"]["misses"] == 1
    assert report.lru_cache_info["get_columns_for_schema"]["misses"] == 1
    assert report.lru_cache_info["get_pk_constraints_for_schema"]["misses"] == 1
    assert report.lru_cache_info["get_fk_constraints_for_schema"]["misses"] == 1
|
2022-12-28 21:50:37 +05:30
|
|
|
|
|
|
|
|
|
|
|
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
    """Snowflake ingestion test for a PrivateLink account id.

    Same mocked-connector setup as the basic test, but with a
    ``.privatelink`` account id and a slimmer config (no usage,
    operational stats, views, column lineage, classification, or
    profiling); output is compared against its own golden file.

    ``mock_time`` and ``mock_datahub_graph`` are fixture parameters not
    referenced directly here — presumably conftest-provided patches;
    TODO confirm against conftest.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake"

    # Run the metadata ingestion pipeline.
    output_file = tmp_path / "snowflake_privatelink_test_events.json"
    golden_file = test_resources_dir / "snowflake_privatelink_golden.json"

    with mock.patch("snowflake.connector.connect") as mock_connect:
        # Canned connection/cursor so every query is answered by
        # default_query_results.
        sf_connection = mock.MagicMock()
        sf_cursor = mock.MagicMock()
        mock_connect.return_value = sf_connection
        sf_connection.cursor.return_value = sf_cursor
        sf_cursor.execute.side_effect = default_query_results

        pipeline = Pipeline(
            config=PipelineConfig(
                source=SourceConfig(
                    type="snowflake",
                    config=SnowflakeV2Config(
                        # PrivateLink-style account id is the variable under test.
                        account_id="ABC12345.ap-south-1.privatelink",
                        username="TST_USR",
                        password="TST_PWD",
                        schema_pattern=AllowDenyPattern(allow=["test_schema"]),
                        include_technical_schema=True,
                        include_table_lineage=True,
                        include_column_lineage=False,
                        include_views=False,
                        include_view_lineage=False,
                        use_legacy_lineage_method=False,
                        include_usage_stats=False,
                        include_operational_stats=False,
                        # Same one-day UTC window as the basic test.
                        start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
                            tzinfo=timezone.utc
                        ),
                        end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(
                            tzinfo=timezone.utc
                        ),
                    ),
                ),
                sink=DynamicTypedConfig(
                    type="file", config={"filename": str(output_file)}
                ),
            )
        )
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status()

    # Verify the output.

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=output_file,
        golden_path=golden_file,
        ignore_paths=[],
    )
|