datahub/metadata-ingestion/tests/unit/test_datahub_source.py
2025-05-26 14:05:17 +02:00

308 lines
11 KiB
Python

from datetime import datetime
from typing import Any, Dict, List
from unittest.mock import MagicMock, patch
import pytest
from datahub.ingestion.source.datahub.datahub_database_reader import (
DATETIME_FORMAT,
DataHubDatabaseReader,
VersionOrderer,
)
@pytest.fixture
def rows():
    """Provide aspect-like rows for the ``VersionOrderer`` tests.

    The rows are listed with non-decreasing ``createdon``; within one
    ``createdon`` value the urns and versions are interleaved, and version 0
    rows appear both before and after higher versions.
    """
    # Each entry is (createdon, version, urn); the list order is significant.
    compact_rows = [
        (0, 0, "one"),
        (0, 1, "one"),
        (0, 0, "two"),
        (0, 0, "three"),
        (0, 1, "three"),
        (0, 2, "three"),
        (0, 1, "two"),
        (0, 4, "three"),
        (0, 5, "three"),
        (1, 6, "three"),
        (1, 0, "four"),
        (2, 0, "five"),
        (2, 1, "six"),
        (2, 0, "six"),
        (3, 0, "seven"),
        (3, 0, "eight"),
    ]
    return [
        {"createdon": createdon, "version": version, "urn": urn}
        for createdon, version, urn in compact_rows
    ]
def test_version_orderer(rows):
    """Check that an enabled ``VersionOrderer`` only reorders rows, by ``(createdon, version == 0)``.

    ``False`` sorts before ``True``, so within a single ``createdon`` value the
    version 0 rows are expected after all other versions.
    """
    orderer = VersionOrderer[Dict[str, Any]](enabled=True)
    ordered_rows = list(orderer(rows))

    # The ordering check below compares the output with a sorted copy of
    # itself, which would also pass if rows were dropped (or nothing was
    # yielded at all). The fixture rows are all distinct, so equal length plus
    # membership proves the output is a permutation of the input.
    assert len(ordered_rows) == len(rows)
    assert all(row in ordered_rows for row in rows)

    # sorted() is stable, so this only passes if the output is already ordered.
    assert ordered_rows == sorted(
        ordered_rows, key=lambda x: (x["createdon"], x["version"] == 0)
    )
def test_version_orderer_disabled(rows):
    """A ``VersionOrderer`` built with ``enabled=False`` must return the rows unchanged."""
    passthrough = VersionOrderer[Dict[str, Any]](enabled=False)
    assert list(passthrough(rows)) == rows
@pytest.fixture
def mock_reader():
    """Yield a ``DataHubDatabaseReader`` wired to mocks instead of a real database.

    ``create_engine`` is patched in the reader module and returns a
    ``MagicMock`` engine whose dialect quotes identifiers with double quotes.
    ``reader.query`` is wrapped in a ``MagicMock`` that still delegates to the
    real method, so tests can assert on how it was called, and
    ``reader.execute_server_cursor`` is replaced outright so each test can
    script the batches it returns.
    """
    with patch(
        "datahub.ingestion.source.datahub.datahub_database_reader.create_engine"
    ) as mock_create_engine:
        config = MagicMock()
        connection_config = MagicMock()
        report = MagicMock()

        # Fake engine -> dialect -> identifier_preparer chain.
        mock_identifier_preparer = MagicMock()
        mock_identifier_preparer.quote = lambda x: f'"{x}"'
        mock_dialect = MagicMock()
        mock_dialect.identifier_preparer = mock_identifier_preparer
        mock_engine = MagicMock()
        mock_engine.dialect = mock_dialect
        mock_create_engine.return_value = mock_engine

        reader = DataHubDatabaseReader(config, connection_config, report)
        reader.query = MagicMock(side_effect=reader.query)  # type: ignore
        reader.execute_server_cursor = MagicMock()  # type: ignore

        # Yield rather than return: returning would leave the ``with`` block
        # and undo the ``create_engine`` patch before the test body runs.
        yield reader
def test_get_rows_for_date_range_no_rows(mock_reader):
    """``_get_rows`` yields nothing when the cursor returns an empty batch."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    mock_reader.execute_server_cursor.return_value = []

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 50))

    # Assert: no rows, the query built once with the flag passed through,
    # and exactly one cursor execution.
    assert not fetched
    mock_reader.query.assert_called_once_with(False)
    mock_reader.execute_server_cursor.assert_called_once()
def test_get_rows_for_date_range_with_rows(mock_reader):
    """A single batch smaller than the limit is yielded as-is after one cursor call."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    expected_rows = [
        {"urn": "urn1", "metadata": "data1", "createdon": datetime(2023, 1, 1, 12, 0)},
        {"urn": "urn2", "metadata": "data2", "createdon": datetime(2023, 1, 1, 13, 0)},
    ]
    mock_reader.execute_server_cursor.return_value = expected_rows

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 50))

    # Assert
    assert fetched == expected_rows
    mock_reader.query.assert_called_once_with(False)
    assert mock_reader.execute_server_cursor.call_count == 1
def test_get_rows_for_date_range_pagination_same_timestamp(mock_reader):
    """Paging across batches whose rows all share one ``createdon`` returns every row in order.

    With a limit of 2, the full first batch is followed by a one-row second
    batch; the test expects paging to stop there, so the scripted third
    (empty) batch is never requested.
    """
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    shared_createdon = datetime(2023, 1, 1, 12, 0)
    first_page = [
        {"urn": "urn1", "metadata": "data1", "createdon": shared_createdon},
        {"urn": "urn2", "metadata": "data2", "createdon": shared_createdon},
    ]
    second_page = [
        {"urn": "urn3", "metadata": "data3", "createdon": shared_createdon},
    ]
    unused_page: List[Dict] = []
    mock_reader.execute_server_cursor.side_effect = [
        first_page,
        second_page,
        unused_page,
    ]

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 2))

    # Assert
    assert [row["urn"] for row in fetched] == ["urn1", "urn2", "urn3"]
    assert mock_reader.execute_server_cursor.call_count == 2
def test_get_rows_for_date_range_pagination_different_timestamp(mock_reader):
    """Paging across batches with strictly increasing ``createdon`` returns every row in order.

    With a limit of 2, the full first batch is followed by a one-row second
    batch; the test expects paging to stop there, so the scripted third
    (empty) batch is never requested.
    """
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    first_page = [
        {"urn": "urn1", "metadata": "data1", "createdon": datetime(2023, 1, 1, 12, 0)},
        {"urn": "urn2", "metadata": "data2", "createdon": datetime(2023, 1, 1, 13, 0)},
    ]
    second_page = [
        {"urn": "urn3", "metadata": "data3", "createdon": datetime(2023, 1, 1, 14, 0)},
    ]
    unused_page: List[Dict] = []
    mock_reader.execute_server_cursor.side_effect = [
        first_page,
        second_page,
        unused_page,
    ]

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 2))

    # Assert
    assert [row["urn"] for row in fetched] == ["urn1", "urn2", "urn3"]
    assert mock_reader.execute_server_cursor.call_count == 2
def test_get_rows_for_date_range_duplicate_data_handling(mock_reader):
    """Check the paging parameters sent with a limit of 1 across three cursor calls.

    Two one-row batches with different ``createdon`` values are followed by an
    empty batch. Both rows must be yielded, and the parameters of each cursor
    call are inspected to see how ``since_createdon`` and ``offset`` move.
    """
    # Setup
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 1, 2)
    noon = datetime(2023, 1, 1, 12, 0)
    first_page = [{"urn": "urn1", "metadata": "data1", "createdon": noon}]
    second_page = [
        {"urn": "urn2", "metadata": "data2", "createdon": datetime(2023, 1, 1, 13, 0)},
    ]
    last_page: List[Dict] = []
    mock_reader.execute_server_cursor.side_effect = [
        first_page,
        second_page,
        last_page,
    ]

    # Execute
    fetched = list(mock_reader._get_rows(start_date, end_date, False, 1))

    # Assert
    assert [row["urn"] for row in fetched] == ["urn1", "urn2"]

    # The query parameters are the second positional argument of each call.
    calls = mock_reader.execute_server_cursor.call_args_list
    assert len(calls) == 3

    # First call: the requested window, the limit, and no offset.
    initial_params = calls[0].args[1]
    assert initial_params["since_createdon"] == start_date.strftime(DATETIME_FORMAT)
    assert initial_params["end_createdon"] == end_date.strftime(DATETIME_FORMAT)
    assert initial_params["limit"] == 1
    assert initial_params["offset"] == 0

    # Second call: expected to resume from the createdon of the row already
    # seen (12:00), skipping that one row via offset 1.
    resumed_params = calls[1].args[1]
    assert resumed_params["offset"] == 1
    assert resumed_params["since_createdon"] == noon.strftime(DATETIME_FORMAT)

    # Third call: the second batch's row had a later createdon (13:00), and
    # the offset is expected to be back at 0.
    final_params = calls[2].args[1]
    assert final_params["offset"] == 0
def test_get_rows_multiple_paging(mock_reader):
    """Walk ``_get_rows`` through four scripted batches and check each call's paging parameters.

    With a limit of 3, three full batches are followed by a one-row batch.
    All ten rows must be yielded in order, and for every cursor call the
    ``since_createdon`` / ``offset`` / ``limit`` parameters are checked.
    """
    # Setup
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 1, 2)

    def make_row(index, hour):
        # One mocked row; only the urn suffix and the createdon hour vary.
        return {
            "urn": f"urn{index}",
            "metadata": "data1",
            "createdon": datetime(2023, 1, 1, hour, 0),
        }

    batch1 = [make_row(1, 12), make_row(2, 12), make_row(3, 12)]
    batch2 = [make_row(4, 12), make_row(5, 12), make_row(6, 13)]
    batch3 = [make_row(7, 14), make_row(8, 14), make_row(9, 15)]
    batch4 = [make_row(10, 16)]
    mock_reader.execute_server_cursor.side_effect = [batch1, batch2, batch3, batch4]

    # Execute
    result = list(mock_reader._get_rows(start_date, end_date, False, 3))

    # Assert: all ten mocked rows come through, in batch order.
    assert [row["urn"] for row in result] == [f"urn{i}" for i in range(1, 11)]

    # The query parameters are the second positional argument of each call.
    calls = mock_reader.execute_server_cursor.call_args_list
    assert len(calls) == 4

    # First call: the requested window, the limit, and no offset.
    first_call_params = calls[0][0][1]
    assert first_call_params["since_createdon"] == start_date.strftime(DATETIME_FORMAT)
    assert first_call_params["end_createdon"] == end_date.strftime(DATETIME_FORMAT)
    assert first_call_params["limit"] == 3
    assert first_call_params["offset"] == 0

    # Second call: batch1 was entirely at 12:00, so the next page is expected
    # to stay on that createdon and skip the three rows already seen.
    second_call_params = calls[1][0][1]
    assert second_call_params["offset"] == 3
    assert second_call_params["limit"] == 3
    assert second_call_params["since_createdon"] == datetime(
        2023, 1, 1, 12, 0
    ).strftime(DATETIME_FORMAT)
    # Check the end of the window on the *second* call (this used to re-check
    # the first call's parameters by mistake).
    assert second_call_params["end_createdon"] == end_date.strftime(DATETIME_FORMAT)

    # Third call: batch2 ended on a new createdon (13:00); the query is
    # expected to move to that createdon with the offset back at 0.
    third_call_params = calls[2][0][1]
    assert third_call_params["offset"] == 0
    assert third_call_params["since_createdon"] == datetime(2023, 1, 1, 13, 0).strftime(
        DATETIME_FORMAT
    )

    # Fourth call: batch3 ended on 15:00; again a new createdon with offset 0.
    fourth_call_params = calls[3][0][1]
    assert fourth_call_params["offset"] == 0
    assert fourth_call_params["since_createdon"] == datetime(
        2023, 1, 1, 15, 0
    ).strftime(DATETIME_FORMAT)
    assert fourth_call_params["limit"] == 3
def test_get_rows_for_date_range_exception_handling(mock_reader):
    """An exception raised by the cursor must propagate out of ``_get_rows``."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    mock_reader.execute_server_cursor.side_effect = Exception("Test exception")

    # Execute and Assert: list() drives the iteration so the cursor call happens.
    with pytest.raises(Exception, match="Test exception"):
        list(mock_reader._get_rows(window_start, window_end, False, 50))
def test_get_rows_for_date_range_exclude_aspects(mock_reader):
    """``config.exclude_aspects`` must reach the cursor parameters as a tuple."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    mock_reader.config.exclude_aspects = ["aspect1", "aspect2"]
    mock_reader.execute_server_cursor.return_value = []

    # Execute: list() drives the iteration so the cursor call happens.
    list(mock_reader._get_rows(window_start, window_end, False, 50))

    # Assert: the query parameters are the second positional argument.
    sent_params = mock_reader.execute_server_cursor.call_args.args[1]
    assert "exclude_aspects" in sent_params
    assert sent_params["exclude_aspects"] == ("aspect1", "aspect2")