# Mirror of https://github.com/datahub-project/datahub.git
# Synced 2025-06-27 05:03:31 +00:00
# 308 lines, 11 KiB, Python
from datetime import datetime
|
|
from typing import Any, Dict, List
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from datahub.ingestion.source.datahub.datahub_database_reader import (
|
|
DATETIME_FORMAT,
|
|
DataHubDatabaseReader,
|
|
VersionOrderer,
|
|
)
|
|
|
|
|
|
@pytest.fixture
def rows():
    """Provide sample rows with ``createdon``, ``version`` and ``urn`` keys.

    Version-0 rows are deliberately interleaved with higher versions that share
    the same ``createdon`` value.
    """
    # Each tuple is (createdon, version, urn); list order is significant.
    compact = [
        (0, 0, "one"),
        (0, 1, "one"),
        (0, 0, "two"),
        (0, 0, "three"),
        (0, 1, "three"),
        (0, 2, "three"),
        (0, 1, "two"),
        (0, 4, "three"),
        (0, 5, "three"),
        (1, 6, "three"),
        (1, 0, "four"),
        (2, 0, "five"),
        (2, 1, "six"),
        (2, 0, "six"),
        (3, 0, "seven"),
        (3, 0, "eight"),
    ]
    return [
        {"createdon": createdon, "version": version, "urn": urn}
        for createdon, version, urn in compact
    ]
|
|
|
|
|
|
def test_version_orderer(rows):
    """With ordering enabled, version-0 rows come last within each ``createdon``."""
    orderer = VersionOrderer[Dict[str, Any]](enabled=True)
    ordered_rows = list(orderer(rows))

    # Comparing the output only with a sorted copy of itself would also pass for
    # an empty or truncated result, so first check that the row count is intact.
    assert len(ordered_rows) == len(rows)

    # sorted() is stable, so equality means the output is already grouped by
    # createdon with version-0 rows (key True) after all others (key False).
    assert ordered_rows == sorted(
        ordered_rows, key=lambda x: (x["createdon"], x["version"] == 0)
    )
|
|
|
|
|
|
def test_version_orderer_disabled(rows):
    """A disabled orderer must hand rows back in their original order."""
    passthrough = VersionOrderer[Dict[str, Any]](enabled=False)
    assert list(passthrough(rows)) == rows
|
|
|
|
|
|
@pytest.fixture
def mock_reader():
    """Build a ``DataHubDatabaseReader`` whose collaborators are all mocks.

    ``create_engine`` is patched while the reader is constructed. ``query`` is
    wrapped in a MagicMock that still delegates to the real method, so calls can
    be asserted on, and ``execute_server_cursor`` is replaced by a plain
    MagicMock so each test can script the rows it returns.
    """
    target = "datahub.ingestion.source.datahub.datahub_database_reader.create_engine"
    with patch(target) as create_engine_mock:
        # Fake engine whose dialect wraps identifiers in double quotes.
        preparer = MagicMock()
        preparer.quote = lambda x: f'"{x}"'
        dialect = MagicMock()
        dialect.identifier_preparer = preparer
        engine = MagicMock()
        engine.dialect = dialect
        create_engine_mock.return_value = engine

        # config, connection_config and report are opaque mocks.
        reader = DataHubDatabaseReader(MagicMock(), MagicMock(), MagicMock())
        reader.query = MagicMock(side_effect=reader.query)  # type: ignore
        reader.execute_server_cursor = MagicMock()  # type: ignore
        return reader
|
|
|
|
|
|
def test_get_rows_for_date_range_no_rows(mock_reader):
    """An empty cursor result yields nothing after one query and one fetch."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    mock_reader.execute_server_cursor.return_value = []

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 50))

    # Assert
    assert fetched == []
    mock_reader.query.assert_called_once_with(False)
    mock_reader.execute_server_cursor.assert_called_once()
|
|
|
|
|
|
def test_get_rows_for_date_range_with_rows(mock_reader):
    """A single batch smaller than the limit is returned as-is with one fetch."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    expected = [
        dict(urn="urn1", metadata="data1", createdon=datetime(2023, 1, 1, 12, 0)),
        dict(urn="urn2", metadata="data2", createdon=datetime(2023, 1, 1, 13, 0)),
    ]
    mock_reader.execute_server_cursor.return_value = expected

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 50))

    # Assert
    assert fetched == expected
    mock_reader.query.assert_called_once_with(False)
    mock_reader.execute_server_cursor.assert_called_once()
|
|
|
|
|
|
def test_get_rows_for_date_range_pagination_same_timestamp(mock_reader):
    """A full page followed by a short page, all rows sharing one ``createdon``.

    With a limit of 2, the first batch fills the page and the second does not,
    so exactly two fetches are expected and the scripted third batch is unused.
    """
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    shared_ts = datetime(2023, 1, 1, 12, 0)
    full_page = [
        {"urn": "urn1", "metadata": "data1", "createdon": shared_ts},
        {"urn": "urn2", "metadata": "data2", "createdon": shared_ts},
    ]
    short_page = [
        {"urn": "urn3", "metadata": "data3", "createdon": shared_ts},
    ]
    empty_page: List[Dict] = []
    mock_reader.execute_server_cursor.side_effect = [full_page, short_page, empty_page]

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 2))

    # Assert
    assert [row["urn"] for row in fetched] == ["urn1", "urn2", "urn3"]
    assert mock_reader.execute_server_cursor.call_count == 2
|
|
|
|
|
|
def test_get_rows_for_date_range_pagination_different_timestamp(mock_reader):
    """A full page followed by a short page, every row with its own ``createdon``.

    With a limit of 2, the first batch fills the page and the second does not,
    so exactly two fetches are expected and the scripted third batch is unused.
    """
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    full_page = [
        {"urn": "urn1", "metadata": "data1", "createdon": datetime(2023, 1, 1, 12, 0)},
        {"urn": "urn2", "metadata": "data2", "createdon": datetime(2023, 1, 1, 13, 0)},
    ]
    short_page = [
        {"urn": "urn3", "metadata": "data3", "createdon": datetime(2023, 1, 1, 14, 0)},
    ]
    empty_page: List[Dict] = []
    mock_reader.execute_server_cursor.side_effect = [full_page, short_page, empty_page]

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 2))

    # Assert
    assert [row["urn"] for row in fetched] == ["urn1", "urn2", "urn3"]
    assert mock_reader.execute_server_cursor.call_count == 2
|
|
|
|
|
|
def test_get_rows_for_date_range_duplicate_data_handling(mock_reader):
    """Page one row at a time and pin the parameters sent on each fetch.

    With a limit of 1, each single-row batch fills the page, so a third fetch
    (returning nothing) is needed to finish.
    """
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    noon = datetime(2023, 1, 1, 12, 0)
    page_one = [{"urn": "urn1", "metadata": "data1", "createdon": noon}]
    page_two = [
        {"urn": "urn2", "metadata": "data2", "createdon": datetime(2023, 1, 1, 13, 0)},
    ]
    page_three: List[Dict] = []
    mock_reader.execute_server_cursor.side_effect = [page_one, page_two, page_three]

    # Execute
    fetched = list(mock_reader._get_rows(window_start, window_end, False, 1))

    # Assert
    assert [row["urn"] for row in fetched] == ["urn1", "urn2"]

    # The bound parameters are the second positional argument of every fetch.
    fetches = mock_reader.execute_server_cursor.call_args_list
    assert len(fetches) == 3
    first_params, second_params, third_params = (call.args[1] for call in fetches)

    # First fetch: the requested window, starting at offset 0.
    assert first_params["since_createdon"] == window_start.strftime(DATETIME_FORMAT)
    assert first_params["end_createdon"] == window_end.strftime(DATETIME_FORMAT)
    assert first_params["limit"] == 1
    assert first_params["offset"] == 0

    # Second fetch: starts at the createdon of the row already returned (12:00)
    # with offset 1.
    assert second_params["offset"] == 1
    assert second_params["since_createdon"] == noon.strftime(DATETIME_FORMAT)

    # Third fetch: the previous page ended on a later createdon; offset is 0 again.
    assert third_params["offset"] == 0
|
|
|
|
|
|
def test_get_rows_multiple_paging(mock_reader):
    """Walk four pages of results and verify the parameters of every fetch.

    The scripted batches mix pages whose rows all share one ``createdon`` with
    pages that end on a later ``createdon``. The assertions pin the
    ``since_createdon``, ``end_createdon``, ``limit`` and ``offset`` values sent
    on each call. With a limit of 3, the one-row fourth batch ends the paging.
    """

    def make_row(index: int, hour: int) -> Dict[str, Any]:
        # Every row uses the same metadata; only urn and createdon hour vary.
        return {
            "urn": f"urn{index}",
            "metadata": "data1",
            "createdon": datetime(2023, 1, 1, hour, 0),
        }

    # Setup
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 1, 2)
    batch1 = [make_row(1, 12), make_row(2, 12), make_row(3, 12)]
    batch2 = [make_row(4, 12), make_row(5, 12), make_row(6, 13)]
    batch3 = [make_row(7, 14), make_row(8, 14), make_row(9, 15)]
    batch4 = [make_row(10, 16)]
    mock_reader.execute_server_cursor.side_effect = [batch1, batch2, batch3, batch4]

    # Execute
    result = list(mock_reader._get_rows(start_date, end_date, False, 3))

    # Assert: every scripted row comes back, in order.
    assert [row["urn"] for row in result] == [f"urn{i}" for i in range(1, 11)]

    # The bound parameters are the second positional argument of every fetch.
    calls = mock_reader.execute_server_cursor.call_args_list
    assert len(calls) == 4

    # First call: the requested window, starting at offset 0.
    first_call_params = calls[0][0][1]
    assert first_call_params["since_createdon"] == start_date.strftime(DATETIME_FORMAT)
    assert first_call_params["end_createdon"] == end_date.strftime(DATETIME_FORMAT)
    assert first_call_params["limit"] == 3
    assert first_call_params["offset"] == 0

    # Second call: every row of the first page had createdon 12:00; the next
    # request is expected at that timestamp with offset 3.
    second_call_params = calls[1][0][1]
    assert second_call_params["offset"] == 3
    assert second_call_params["limit"] == 3
    assert second_call_params["since_createdon"] == datetime(
        2023, 1, 1, 12, 0
    ).strftime(DATETIME_FORMAT)
    # Check the second call's own upper bound (previously this re-checked the
    # first call's parameters).
    assert second_call_params["end_createdon"] == end_date.strftime(DATETIME_FORMAT)

    # Third call: the second page ended on a row at 13:00; offset is 0 again.
    third_call_params = calls[2][0][1]
    assert third_call_params["offset"] == 0
    assert third_call_params["since_createdon"] == datetime(2023, 1, 1, 13, 0).strftime(
        DATETIME_FORMAT
    )

    # Fourth call: the third page ended on a row at 15:00; offset stays 0.
    fourth_call_params = calls[3][0][1]
    assert fourth_call_params["offset"] == 0
    assert fourth_call_params["since_createdon"] == datetime(
        2023, 1, 1, 15, 0
    ).strftime(DATETIME_FORMAT)
    assert fourth_call_params["limit"] == 3
|
|
|
|
|
|
def test_get_rows_for_date_range_exception_handling(mock_reader):
    """An exception raised by the cursor call propagates out of ``_get_rows``."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    mock_reader.execute_server_cursor.side_effect = Exception("Test exception")

    # Execute and Assert: the generator must be consumed for the error to surface.
    with pytest.raises(Exception, match="Test exception"):
        list(mock_reader._get_rows(window_start, window_end, False, 50))
|
|
|
|
|
|
def test_get_rows_for_date_range_exclude_aspects(mock_reader):
    """Configured ``exclude_aspects`` reach the cursor parameters as a tuple."""
    # Setup
    window_start, window_end = datetime(2023, 1, 1), datetime(2023, 1, 2)
    mock_reader.config.exclude_aspects = ["aspect1", "aspect2"]
    mock_reader.execute_server_cursor.return_value = []

    # Execute
    list(mock_reader._get_rows(window_start, window_end, False, 50))

    # Assert: the bound parameters are the second positional argument.
    bound_params = mock_reader.execute_server_cursor.call_args.args[1]
    assert "exclude_aspects" in bound_params
    assert bound_params["exclude_aspects"] == ("aspect1", "aspect2")
|