datahub/metadata-ingestion/tests/unit/test_usage_common.py

262 lines
8.2 KiB
Python
Raw Normal View History

from datetime import datetime
import pytest
from pydantic import ValidationError
from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.time_window_config import BucketDuration, get_time_bucket
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.usage.usage_common import (
BaseUsageConfig,
GenericAggregatedDataset,
)
from datahub.metadata.schema_classes import DatasetUsageStatisticsClass
# Table references are plain strings in these tests.
_TestTableRef = str
# GenericAggregatedDataset parametrised with the string table reference above;
# every test below builds its aggregate through this alias.
_TestAggregatedDataset = GenericAggregatedDataset[_TestTableRef]
def test_add_one_query_without_columns():
    """A single read without columns is counted once per query and per user."""
    user = "test_email@test.com"
    query = "select * from test"
    bucket_start = get_time_bucket(datetime(2020, 1, 1), BucketDuration.DAY)
    aggregate = _TestAggregatedDataset(
        bucket_start_time=bucket_start,
        resource="test_db.test_schema.test_table",
    )

    aggregate.add_read_entry(user, query, [])

    assert aggregate.queryCount == 1
    assert aggregate.queryFreq[query] == 1
    assert aggregate.userFreq[user] == 1
    # No columns were passed in, so no column frequencies are recorded.
    assert len(aggregate.columnFreq) == 0
def test_add_one_query_with_ignored_user():
    """A read issued by a user on the deny list leaves every counter at zero."""
    test_email = "test_email@test.com"
    test_query = "select * from test"
    event_time = datetime(2020, 1, 1)
    floored_ts = get_time_bucket(event_time, BucketDuration.DAY)
    resource = "test_db.test_schema.test_table"

    ta = _TestAggregatedDataset(
        bucket_start_time=floored_ts,
        resource=resource,
        # Deny exactly the user that issues the query below; reusing the
        # variable keeps the pattern and the caller from drifting apart.
        user_email_pattern=AllowDenyPattern(deny=[test_email]),
    )
    ta.add_read_entry(test_email, test_query, [])

    assert ta.queryCount == 0
    assert ta.queryFreq[test_query] == 0
    assert ta.userFreq[test_email] == 0
    assert len(ta.columnFreq) == 0
def test_multiple_query_with_ignored_user():
    """Only reads from users that are not on the deny list are aggregated.

    Two reads come from the denied user and one from an allowed user; the
    aggregate is expected to reflect the allowed read alone.
    """
    test_email = "test_email@test.com"
    test_email2 = "test_email2@test.com"
    test_query = "select * from test"
    test_query2 = "select * from test2"
    event_time = datetime(2020, 1, 1)
    floored_ts = get_time_bucket(event_time, BucketDuration.DAY)
    resource = "test_db.test_schema.test_table"

    ta = _TestAggregatedDataset(
        bucket_start_time=floored_ts,
        resource=resource,
        # Deny the first user only; the second user must still be counted.
        user_email_pattern=AllowDenyPattern(deny=[test_email]),
    )

    reads = (
        (test_email, test_query),
        (test_email, test_query),
        (test_email2, test_query2),
    )
    for email, query in reads:
        ta.add_read_entry(email, query, [])

    assert ta.queryCount == 1
    assert ta.queryFreq[test_query] == 0
    assert ta.userFreq[test_email] == 0
    assert ta.queryFreq[test_query2] == 1
    assert ta.userFreq[test_email2] == 1
    assert len(ta.columnFreq) == 0
def test_multiple_query_without_columns():
    """Repeated reads accumulate per-query and per-user frequencies."""
    first_user = "test_email@test.com"
    second_user = "test_email2@test.com"
    first_query = "select * from test"
    second_query = "select * from test2"
    bucket_start = get_time_bucket(datetime(2020, 1, 1), BucketDuration.DAY)
    aggregate = _TestAggregatedDataset(
        bucket_start_time=bucket_start,
        resource="test_db.test_schema.test_table",
    )

    # The first user runs the same query twice, the second user runs another once.
    reads = (
        (first_user, first_query),
        (first_user, first_query),
        (second_user, second_query),
    )
    for user, query in reads:
        aggregate.add_read_entry(user, query, [])

    assert aggregate.queryCount == 3
    assert aggregate.queryFreq[first_query] == 2
    assert aggregate.userFreq[first_user] == 2
    assert aggregate.queryFreq[second_query] == 1
    assert aggregate.userFreq[second_user] == 1
    assert len(aggregate.columnFreq) == 0
def test_make_usage_workunit():
    """A single read yields a work unit carrying one unformatted top query."""
    user = "test_email@test.com"
    query = "select * from test"
    bucket_start = get_time_bucket(datetime(2020, 1, 1), BucketDuration.DAY)
    aggregate = _TestAggregatedDataset(
        bucket_start_time=bucket_start,
        resource="test_db.test_schema.test_table",
    )
    aggregate.add_read_entry(user, query, [])

    workunit: MetadataWorkUnit = aggregate.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda ref: ref,
        top_n_queries=10,
        format_sql_queries=False,
        include_top_n_queries=True,
    )

    assert workunit.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(
        workunit.get_metadata()["metadata"], MetadataChangeProposalWrapper
    )
    stats: DatasetUsageStatisticsClass = workunit.get_metadata()["metadata"].aspect
    assert stats.totalSqlQueries == 1
    assert stats.topSqlQueries
    assert stats.topSqlQueries.pop() == query
def test_query_formatting():
    """With ``format_sql_queries=True`` the top query is emitted in formatted form."""
    user = "test_email@test.com"
    raw_query = "select * from foo where id in (select id from bar);"
    # NOTE(review): the expected text must match the SQL formatter's output
    # character for character, including whitespace -- confirm against it.
    expected_query: str = "SELECT *\n FROM foo\n WHERE id in (\n SELECT id\n FROM bar\n );"
    bucket_start = get_time_bucket(datetime(2020, 1, 1), BucketDuration.DAY)
    aggregate = _TestAggregatedDataset(
        bucket_start_time=bucket_start,
        resource="test_db.test_schema.test_table",
    )
    aggregate.add_read_entry(user, raw_query, [])

    workunit: MetadataWorkUnit = aggregate.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda ref: ref,
        top_n_queries=10,
        format_sql_queries=True,
        include_top_n_queries=True,
    )

    assert workunit.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(
        workunit.get_metadata()["metadata"], MetadataChangeProposalWrapper
    )
    stats: DatasetUsageStatisticsClass = workunit.get_metadata()["metadata"].aspect
    assert stats.totalSqlQueries == 1
    assert stats.topSqlQueries
    assert stats.topSqlQueries.pop() == expected_query
def test_query_trimming():
    """A query exceeding its share of the query-list budget is emitted truncated.

    The aggregate's ``total_budget_for_query_list`` is set to 200 and
    ``top_n_queries`` to 10; the query is then expected to come back as
    ``"select * f ..."``.
    """
    user = "test_email@test.com"
    long_query = "select * from test where a > 10 and b > 20 order by a asc"
    query_limit = 10
    list_budget = 200
    bucket_start = get_time_bucket(datetime(2020, 1, 1), BucketDuration.DAY)
    aggregate = _TestAggregatedDataset(
        bucket_start_time=bucket_start,
        resource="test_db.test_schema.test_table",
    )
    # Override the budget on this instance only.
    aggregate.total_budget_for_query_list = list_budget
    aggregate.add_read_entry(user, long_query, [])

    workunit: MetadataWorkUnit = aggregate.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda ref: ref,
        top_n_queries=query_limit,
        format_sql_queries=False,
        include_top_n_queries=True,
    )

    assert workunit.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(
        workunit.get_metadata()["metadata"], MetadataChangeProposalWrapper
    )
    stats: DatasetUsageStatisticsClass = workunit.get_metadata()["metadata"].aspect
    assert stats.totalSqlQueries == 1
    assert stats.topSqlQueries
    assert stats.topSqlQueries.pop() == "select * f ..."
def test_top_n_queries_validator_fails():
    """``BaseUsageConfig`` rejects ``top_n_queries=2`` when the query budget is 20.

    The class-level ``GenericAggregatedDataset.total_budget_for_query_list`` is
    lowered only for the duration of the check and restored afterwards, so the
    override cannot leak into other tests running in the same process.
    """
    original_budget = GenericAggregatedDataset.total_budget_for_query_list
    GenericAggregatedDataset.total_budget_for_query_list = 20
    try:
        # Keep only the call expected to raise inside the raises-context.
        with pytest.raises(ValidationError) as excinfo:
            BaseUsageConfig(top_n_queries=2)
    finally:
        GenericAggregatedDataset.total_budget_for_query_list = original_budget

    # Checked after the context exits; code placed after the raising call
    # inside the ``with`` block would never execute.
    assert "top_n_queries is set to 2 but it can be maximum 1" in str(excinfo.value)
def test_make_usage_workunit_include_top_n_queries():
    """With ``include_top_n_queries=False`` the aspect carries no top queries.

    The total query count is still reported; only ``topSqlQueries`` is ``None``.
    """
    user = "test_email@test.com"
    query = "select * from test"
    bucket_start = get_time_bucket(datetime(2020, 1, 1), BucketDuration.DAY)
    aggregate = _TestAggregatedDataset(
        bucket_start_time=bucket_start,
        resource="test_db.test_schema.test_table",
    )
    aggregate.add_read_entry(user, query, [])

    workunit: MetadataWorkUnit = aggregate.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda ref: ref,
        top_n_queries=10,
        format_sql_queries=False,
        include_top_n_queries=False,
    )

    assert workunit.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(
        workunit.get_metadata()["metadata"], MetadataChangeProposalWrapper
    )
    stats: DatasetUsageStatisticsClass = workunit.get_metadata()["metadata"].aspect
    assert stats.totalSqlQueries == 1
    assert stats.topSqlQueries is None