# Mirror of https://github.com/datahub-project/datahub.git
# Synced 2025-08-07 16:57:52 +00:00
# 429 lines, 17 KiB, Python
from datetime import datetime, timezone

import pytest

from datahub.ingestion.source.sql_queries import QueryEntry, SqlQueriesSourceConfig
from datahub.metadata.urns import CorpUserUrn, DatasetUrn


class TestQueryEntry:
    """Parametrized tests for ``QueryEntry.create``.

    Each case feeds a raw entry dict plus a ``SqlQueriesSourceConfig`` into
    ``QueryEntry.create`` and compares the result field by field against an
    expected ``QueryEntry`` (or expects a ``ValueError``).
    """

    @pytest.mark.parametrize(
        "entry_dict,entry_config,expected_query_entry,should_raise",
        [
            # Timestamp format variations
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="numeric_unix_timestamp",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200.5,
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(
                        2021, 1, 1, 0, 0, 0, 500000, tzinfo=timezone.utc
                    ),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="float_unix_timestamp",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": "1609459200",
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="string_unix_timestamp",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": "2021-01-01T00:00:00Z",
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="iso_format_with_z",
            ),
            # Naive datetime strings: the expected value is timezone-aware UTC.
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": "2025-07-19 15:08:07.000000",
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2025, 7, 19, 15, 8, 7, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="datetime_string_with_microseconds",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": "2025-07-19 15:08:07",
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2025, 7, 19, 15, 8, 7, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="datetime_string_without_microseconds",
            ),
            # Table filtering - empty strings removed
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": ["table1", "", "table2"],
                    "downstream_tables": ["output_table"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,output_table,PROD)"
                        )
                    ],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        ),
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table2,PROD)"
                        ),
                    ],
                    session_id=None,
                ),
                False,
                id="filter_empty_upstream_tables",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": [""],
                    "downstream_tables": [""],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[],
                    session_id=None,
                ),
                False,
                id="all_empty_tables",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": [" ", "\t"],
                    "downstream_tables": [" "],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[],
                    session_id=None,
                ),
                False,
                id="whitespace_only_tables",
            ),
            # Missing fields
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=None,
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="no_timestamp",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=None,
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="no_user",
            ),
            # Both upstream and downstream tables
            pytest.param(
                {
                    "query": "CREATE TABLE out AS SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                    "downstream_tables": ["output_table"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                QueryEntry(
                    query="CREATE TABLE out AS SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,output_table,PROD)"
                        )
                    ],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:athena,table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="upstream_and_downstream_tables",
            ),
            # Config variations - platform, platform_instance, env
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(
                    platform="snowflake",
                    query_file="dummy.json",
                    platform_instance="prod_instance",
                    env="DEV",
                ),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:snowflake,prod_instance.table1,DEV)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="config_with_platform_instance_and_env",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(
                    platform="bigquery", query_file="dummy.json", env="PRE"
                ),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:bigquery,table1,PRE)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="config_with_different_env",
            ),
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": 1609459200,
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(
                    platform="postgres",
                    query_file="dummy.json",
                    platform_instance="dev_cluster",
                ),
                QueryEntry(
                    query="SELECT * FROM table",
                    timestamp=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                    user=CorpUserUrn("test_user"),
                    operation_type=None,
                    downstream_tables=[],
                    upstream_tables=[
                        DatasetUrn.from_string(
                            "urn:li:dataset:(urn:li:dataPlatform:postgres,dev_cluster.table1,PROD)"
                        )
                    ],
                    session_id=None,
                ),
                False,
                id="config_with_platform_instance_only",
            ),
            # Error cases
            pytest.param(
                {
                    "query": "SELECT * FROM table",
                    "timestamp": "invalid-timestamp",
                    "user": "test_user",
                    "upstream_tables": ["table1"],
                },
                SqlQueriesSourceConfig(platform="athena", query_file="dummy.json"),
                None,
                True,
                id="invalid_timestamp_format",
            ),
        ],
    )
    def test_create(self, entry_dict, entry_config, expected_query_entry, should_raise):
        """Test QueryEntry creation with various input formats and edge cases.

        Args:
            entry_dict: Raw query entry passed to ``QueryEntry.create``.
            entry_config: ``SqlQueriesSourceConfig`` passed as ``config``.
            expected_query_entry: ``QueryEntry`` whose fields the result must
                equal; ``None`` for cases that are expected to raise.
            should_raise: When true, ``QueryEntry.create`` must raise
                ``ValueError`` and no field comparison is performed.
        """
        if should_raise:
            with pytest.raises(ValueError):
                QueryEntry.create(entry_dict, config=entry_config)
            return

        query_entry = QueryEntry.create(entry_dict, config=entry_config)

        # Check all fields match expected
        assert query_entry.query == expected_query_entry.query
        assert query_entry.timestamp == expected_query_entry.timestamp
        assert query_entry.user == expected_query_entry.user
        assert query_entry.operation_type == expected_query_entry.operation_type
        assert query_entry.downstream_tables == expected_query_entry.downstream_tables
        assert query_entry.upstream_tables == expected_query_entry.upstream_tables
        assert query_entry.session_id == expected_query_entry.session_id