# datahub/metadata-ingestion/tests/unit/dremio/test_dremio_api_pagination.py
#
# 282 lines
# 11 KiB
# Python

from collections import deque
from unittest.mock import Mock, patch
import pytest
from datahub.ingestion.source.dremio.dremio_api import (
DremioAPIException,
DremioAPIOperations,
DremioEdition,
)
from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
class TestDremioAPIPagination:
    """Unit tests for result pagination and streaming in ``DremioAPIOperations``.

    Every test receives a ``DremioAPIOperations`` instance whose HTTP session is
    a ``Mock``; individual tests then replace the lower-level methods
    (``get_job_result``, ``get_job_status``, ``post``, ...) with mocks and check
    how the higher-level fetch/iterate helpers react.
    """

    @pytest.fixture
    def dremio_api(self, monkeypatch):
        """Build a ``DremioAPIOperations`` backed by a mocked ``requests.Session``."""
        # Replace requests.Session so constructing the API object does no real I/O.
        mock_session = Mock()
        monkeypatch.setattr("requests.Session", Mock(return_value=mock_session))

        # Canned authentication response returned by the mocked session.
        mock_session.post.return_value.json.return_value = {"token": "dummy-token"}
        mock_session.post.return_value.status_code = 200

        config = DremioSourceConfig(
            hostname="dummy-host",
            port=9047,
            tls=False,
            authentication_method="password",
            username="dummy-user",
            password="dummy-password",
        )
        report = DremioSourceReport()

        api = DremioAPIOperations(config, report)
        api.session = mock_session
        return api

    def test_fetch_all_results_missing_rows_key(self, dremio_api):
        """Test handling of API response missing 'rows' key"""
        # Mock get_job_result to return response without 'rows' key
        dremio_api.get_job_result = Mock(return_value={"message": "No data available"})

        result = dremio_api._fetch_all_results("test-job-id")

        # Should return empty list when no rows key is present
        assert result == []
        dremio_api.get_job_result.assert_called_once()

    def test_fetch_all_results_with_error_message(self, dremio_api):
        """Test handling of API response with errorMessage"""
        # Mock get_job_result to return error response
        dremio_api.get_job_result = Mock(return_value={"errorMessage": "Out of memory"})

        with pytest.raises(DremioAPIException, match="Query error: Out of memory"):
            dremio_api._fetch_all_results("test-job-id")

    def test_fetch_all_results_empty_rows(self, dremio_api):
        """Test handling of empty rows response"""
        # Mock get_job_result to return empty rows
        dremio_api.get_job_result = Mock(return_value={"rows": [], "rowCount": 0})

        result = dremio_api._fetch_all_results("test-job-id")

        assert result == []
        dremio_api.get_job_result.assert_called_once()

    def test_fetch_all_results_normal_case(self, dremio_api):
        """Test normal operation with valid rows"""
        # Mock get_job_result to return valid data
        # First response: 2 rows, rowCount=2 (offset will be 2, which equals rowCount, so stops)
        mock_responses = [
            {"rows": [{"col1": "val1"}, {"col1": "val2"}], "rowCount": 2},
        ]
        dremio_api.get_job_result = Mock(side_effect=mock_responses)

        result = dremio_api._fetch_all_results("test-job-id")

        expected = [{"col1": "val1"}, {"col1": "val2"}]
        assert result == expected
        assert dremio_api.get_job_result.call_count == 1

    def test_fetch_results_iter_missing_rows_key(self, dremio_api):
        """Test internal streaming method handling missing 'rows' key"""
        # Mock get_job_result to return response without 'rows' key
        dremio_api.get_job_result = Mock(return_value={"message": "No data available"})

        result_iterator = dremio_api._fetch_results_iter("test-job-id")
        results = list(result_iterator)

        # Should return empty iterator when no rows key is present
        assert results == []

    def test_fetch_results_iter_with_error(self, dremio_api):
        """Test internal streaming method handling error response"""
        # Mock get_job_result to return error response
        dremio_api.get_job_result = Mock(return_value={"errorMessage": "Query timeout"})

        result_iterator = dremio_api._fetch_results_iter("test-job-id")

        # The iterator is only consumed inside the raises block, so the error
        # is expected to surface during iteration.
        with pytest.raises(DremioAPIException, match="Query error: Query timeout"):
            list(result_iterator)

    def test_fetch_results_iter_normal_case(self, dremio_api):
        """Test internal streaming method with valid data"""
        # Mock get_job_result to return valid data in batches
        mock_responses = [
            {"rows": [{"col1": "val1"}, {"col1": "val2"}], "rowCount": 2},
            {"rows": [], "rowCount": 2},  # Empty response to end iteration
        ]
        dremio_api.get_job_result = Mock(side_effect=mock_responses)

        result_iterator = dremio_api._fetch_results_iter("test-job-id")
        results = list(result_iterator)

        expected = [
            {"col1": "val1"},
            {"col1": "val2"},
        ]
        assert results == expected

    def test_execute_query_iter_success(self, dremio_api):
        """Test execute_query_iter with successful job completion"""
        # Mock the POST response for job submission
        dremio_api.post = Mock(return_value={"id": "job-123"})

        # Mock job status progression
        status_responses = [
            {"jobState": "RUNNING"},
            {"jobState": "RUNNING"},
            {"jobState": "COMPLETED"},
        ]
        dremio_api.get_job_status = Mock(side_effect=status_responses)

        # Mock streaming results
        dremio_api._fetch_results_iter = Mock(return_value=iter([{"col1": "val1"}]))

        # Skip actual sleep delays. Patch the name inside the module under test
        # (the same target the timeout test uses); patching ``time.sleep``
        # would not replace a ``sleep`` that the module imported by name.
        with patch("datahub.ingestion.source.dremio.dremio_api.sleep"):
            result_iterator = dremio_api.execute_query_iter("SELECT * FROM test")
            results = list(result_iterator)

        assert results == [{"col1": "val1"}]
        dremio_api.post.assert_called_once()
        assert dremio_api.get_job_status.call_count == 3

    def test_execute_query_iter_job_failure(self, dremio_api):
        """Test execute_query_iter with job failure"""
        # Mock the POST response for job submission
        dremio_api.post = Mock(return_value={"id": "job-123"})

        # Mock job failure
        dremio_api.get_job_status = Mock(
            return_value={"jobState": "FAILED", "errorMessage": "SQL syntax error"}
        )

        with (
            pytest.raises(RuntimeError, match="Query failed: SQL syntax error"),
            # Same patch target as the timeout test, so no real delay can occur.
            patch("datahub.ingestion.source.dremio.dremio_api.sleep"),
        ):
            result_iterator = dremio_api.execute_query_iter("SELECT * FROM test")
            list(result_iterator)  # Force evaluation

    def test_execute_query_iter_timeout(self, dremio_api):
        """Test execute_query_iter with timeout"""
        # Mock the POST response for job submission
        dremio_api.post = Mock(return_value={"id": "job-123"})

        # Mock job that never completes
        dremio_api.get_job_status = Mock(return_value={"jobState": "RUNNING"})
        dremio_api.cancel_query = Mock()

        # Mock time.time to simulate timeout - need to patch where it's imported
        # First call: start_time = 0
        # Second call: time() - start_time check = 3700 (triggers timeout)
        mock_time_values = [0, 3700]

        with (
            patch(
                "datahub.ingestion.source.dremio.dremio_api.time",
                side_effect=mock_time_values,
            ),
            patch("datahub.ingestion.source.dremio.dremio_api.sleep"),
            pytest.raises(
                DremioAPIException,
                match="Query execution timed out after 3600 seconds",
            ),
        ):
            result_iterator = dremio_api.execute_query_iter("SELECT * FROM test")
            list(result_iterator)

        # Must sit outside the ``with`` block: statements after the raising
        # call inside ``pytest.raises`` would never execute.
        dremio_api.cancel_query.assert_called_once_with("job-123")

    def test_get_all_tables_and_columns(self, dremio_api):
        """Test streaming version of get_all_tables_and_columns"""
        # Set up test data
        dremio_api.edition = DremioEdition.ENTERPRISE
        dremio_api.allow_schema_pattern = [".*"]
        dremio_api.deny_schema_pattern = []

        # Mock container
        mock_container = Mock()
        mock_container.container_name = "test_source"
        containers = deque([mock_container])

        # Mock streaming query results
        mock_results = [
            {
                "COLUMN_NAME": "col1",
                "FULL_TABLE_PATH": "test.table1",
                "TABLE_NAME": "table1",
                "TABLE_SCHEMA": "test",
                "ORDINAL_POSITION": 1,
                "IS_NULLABLE": "YES",
                "DATA_TYPE": "VARCHAR",
                "COLUMN_SIZE": 255,
                "RESOURCE_ID": "res1",
            }
        ]
        dremio_api.execute_query_iter = Mock(return_value=iter(mock_results))

        # Test streaming method
        result_iterator = dremio_api.get_all_tables_and_columns(containers)
        tables = list(result_iterator)

        assert len(tables) == 1
        table = tables[0]
        assert table["TABLE_NAME"] == "table1"
        assert table["TABLE_SCHEMA"] == "test"
        assert len(table["COLUMNS"]) == 1
        assert table["COLUMNS"][0]["name"] == "col1"

    def test_extract_all_queries(self, dremio_api):
        """Test streaming version of extract_all_queries"""
        dremio_api.edition = DremioEdition.ENTERPRISE
        dremio_api.start_time = None
        dremio_api.end_time = None

        # Mock streaming query execution
        mock_query_results = [
            {"query_id": "q1", "sql": "SELECT * FROM table1"},
            {"query_id": "q2", "sql": "SELECT * FROM table2"},
        ]
        dremio_api.execute_query_iter = Mock(return_value=iter(mock_query_results))

        result_iterator = dremio_api.extract_all_queries()
        queries = list(result_iterator)

        assert len(queries) == 2
        assert queries[0]["query_id"] == "q1"
        assert queries[1]["query_id"] == "q2"
        dremio_api.execute_query_iter.assert_called_once()

    def test_fetch_results_iter_incremental_yielding(self, dremio_api):
        """Test that the internal iterator yields incrementally.

        The iterator is consumed only partially and must still be able to
        produce further results afterwards.
        """
        # Verify that streaming yields results one at a time and iterator state is maintained
        mock_responses = [
            {"rows": [{"id": i} for i in range(100)], "rowCount": 1000},
            {"rows": [{"id": i} for i in range(100, 200)], "rowCount": 1000},
            {"rows": [], "rowCount": 1000},  # End iteration
        ]
        dremio_api.get_job_result = Mock(side_effect=mock_responses)

        # Test that streaming version yields results incrementally
        result_iterator = dremio_api._fetch_results_iter("test-job")

        # Get first few results
        first_batch = []
        for i, result in enumerate(result_iterator):
            first_batch.append(result)
            if i >= 50:  # Stop after getting 51 results
                break

        assert len(first_batch) == 51

        # Verify we can still get more results (iterator not exhausted)
        next_result = next(result_iterator, None)
        assert next_result is not None