OpenMetadata/ingestion/tests/unit/readers/test_parquet_batches.py

# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for ParquetDataFrameReader _read_parquet_in_batches method
"""
import unittest
from unittest.mock import Mock, patch

import pandas as pd

from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
    LocalConfig,
)
from metadata.readers.dataframe.common import dataframe_to_chunks
from metadata.readers.dataframe.parquet import ParquetDataFrameReader


class TestParquetBatchReading(unittest.TestCase):
    """
    Test the _read_parquet_in_batches method functionality
    """

    def setUp(self):
        """Set up test fixtures"""
        self.config_source = LocalConfig()
        self.reader = ParquetDataFrameReader(self.config_source, None)
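
    # The tests below exercise ParquetDataFrameReader._read_parquet_in_batches.
    # From what they assert, the method behaves roughly like the sketch below
    # (a reconstruction for orientation, not the actual implementation; the
    # real default batch_size is not asserted anywhere in this suite):
    #
    #     def _read_parquet_in_batches(self, parquet_file, batch_size=...):
    #         try:
    #             chunks = []
    #             for batch in parquet_file.iter_batches(batch_size=batch_size):
    #                 df = batch.to_pandas()
    #                 if not df.empty:  # empty batches are skipped
    #                     chunks.extend(dataframe_to_chunks(df))
    #             return chunks
    #         except Exception:
    #             # falls back to a full read, logging a warning first
    #             return dataframe_to_chunks(parquet_file.read().to_pandas())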

    def test_successful_batch_reading_with_iter_batches(self):
        """Test successful batched reading when iter_batches is available"""
        # Create a mock parquet file with iter_batches capability
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create sample data for two batches
        batch1_data = pd.DataFrame(
            {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}
        )
        batch2_data = pd.DataFrame(
            {"id": [4, 5], "name": ["David", "Eve"], "age": [40, 45]}
        )

        # Create mock Arrow record batches
        mock_batch1 = Mock()
        mock_batch1.to_pandas.return_value = batch1_data
        mock_batch2 = Mock()
        mock_batch2.to_pandas.return_value = batch2_data
        mock_parquet_file.iter_batches.return_value = iter([mock_batch1, mock_batch2])

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            # Mock dataframe_to_chunks to return each non-empty dataframe as a
            # single chunk
            mock_chunks.side_effect = lambda df: [df] if not df.empty else []

            # Call the method under test
            result = self.reader._read_parquet_in_batches(
                mock_parquet_file, batch_size=1000
            )

            # Verify iter_batches was called with the correct batch_size
            mock_parquet_file.iter_batches.assert_called_once_with(batch_size=1000)
            # Verify dataframe_to_chunks was called twice (once per batch)
            self.assertEqual(mock_chunks.call_count, 2)
            # Verify the result contains the chunks from both batches
            self.assertEqual(len(result), 2)
            pd.testing.assert_frame_equal(result[0], batch1_data)
            pd.testing.assert_frame_equal(result[1], batch2_data)

    def test_batch_reading_with_empty_batches(self):
        """Test that empty batches are properly skipped"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create one non-empty and one empty batch
        non_empty_batch = pd.DataFrame({"id": [1], "name": ["Alice"]})
        empty_batch = pd.DataFrame()
        mock_batch1 = Mock()
        mock_batch1.to_pandas.return_value = non_empty_batch
        mock_batch2 = Mock()
        mock_batch2.to_pandas.return_value = empty_batch
        mock_parquet_file.iter_batches.return_value = iter([mock_batch1, mock_batch2])

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.side_effect = lambda df: [df] if not df.empty else []

            result = self.reader._read_parquet_in_batches(mock_parquet_file)

            # Should only call dataframe_to_chunks once, for the non-empty batch
            mock_chunks.assert_called_once_with(non_empty_batch)
            # The result should only contain the non-empty batch
            self.assertEqual(len(result), 1)
            pd.testing.assert_frame_equal(result[0], non_empty_batch)

    def test_exception_handling_with_successful_fallback(self):
        """Test exception handling when batching fails but the fallback succeeds"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Make iter_batches raise an exception...
        mock_parquet_file.iter_batches.side_effect = Exception("Batching failed")
        # ...but let the regular (non-batched) read succeed
        sample_data = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
        mock_arrow_table = Mock()
        mock_arrow_table.to_pandas.return_value = sample_data
        mock_parquet_file.read.return_value = mock_arrow_table

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.return_value = [sample_data]
            with patch("metadata.readers.dataframe.parquet.logger") as mock_logger:
                result = self.reader._read_parquet_in_batches(mock_parquet_file)

                # Verify a warning was logged about the fallback; the exact
                # message is asserted, so changing the log text breaks this test
                mock_logger.warning.assert_called_with(
                    "Batched reading failed: Batching failed. Falling back to regular reading - "
                    "this may cause memory issues for large files"
                )
                # Verify the regular read was used as the fallback
                mock_parquet_file.read.assert_called_once()
                # Verify the result
                self.assertEqual(result, [sample_data])

    def test_exception_handling_with_failed_fallback(self):
        """Test that the fallback exception propagates when both batching and the regular read fail"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Make both iter_batches and read fail
        mock_parquet_file.iter_batches.side_effect = Exception("Batching failed")
        mock_parquet_file.read.side_effect = Exception("Regular read failed")

        with patch("metadata.readers.dataframe.parquet.logger") as mock_logger:
            # Should raise the fallback exception
            with self.assertRaises(Exception) as context:
                self.reader._read_parquet_in_batches(mock_parquet_file)
            self.assertEqual(str(context.exception), "Regular read failed")
            # Verify the error was logged
            mock_logger.error.assert_called_with(
                "Failed to read parquet file: Regular read failed"
            )

    def test_custom_batch_size(self):
        """Test that a custom batch size is properly passed to iter_batches"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create a simple batch
        batch_data = pd.DataFrame({"id": [1], "name": ["Alice"]})
        mock_batch = Mock()
        mock_batch.to_pandas.return_value = batch_data
        mock_parquet_file.iter_batches.return_value = iter([mock_batch])

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.return_value = [batch_data]

            # Test with a custom batch size
            custom_batch_size = 5000
            self.reader._read_parquet_in_batches(
                mock_parquet_file, batch_size=custom_batch_size
            )

            # Verify the custom batch size was forwarded to iter_batches
            mock_parquet_file.iter_batches.assert_called_once_with(
                batch_size=custom_batch_size
            )

    def test_batch_counting_and_logging(self):
        """Test that batch counting and logging work correctly"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create multiple batches
        batches_data = [
            pd.DataFrame({"id": [1], "name": ["Alice"]}),
            pd.DataFrame({"id": [2], "name": ["Bob"]}),
            pd.DataFrame({"id": [3], "name": ["Charlie"]}),
        ]
        mock_batches = []
        for data in batches_data:
            mock_batch = Mock()
            mock_batch.to_pandas.return_value = data
            mock_batches.append(mock_batch)
        mock_parquet_file.iter_batches.return_value = iter(mock_batches)

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.side_effect = lambda df: [df] if not df.empty else []
            with patch("metadata.readers.dataframe.parquet.logger") as mock_logger:
                result = self.reader._read_parquet_in_batches(mock_parquet_file)

                # Verify the expected info logs were emitted
                mock_logger.info.assert_any_call(
                    "Reading large parquet file in batches to avoid memory issues"
                )
                mock_logger.info.assert_any_call(
                    "Successfully processed 3 batches from large parquet file"
                )
                # Verify all batches were processed
                self.assertEqual(len(result), 3)

    def test_dataframe_to_chunks_integration(self):
        """Test integration with the real dataframe_to_chunks function"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create a batch large enough to potentially be split into multiple chunks
        batch_data = pd.DataFrame(
            {
                "id": list(range(1000)),
                "name": [f"User{i}" for i in range(1000)],
            }
        )
        mock_batch = Mock()
        mock_batch.to_pandas.return_value = batch_data
        mock_parquet_file.iter_batches.return_value = iter([mock_batch])

        # Use the real dataframe_to_chunks function (no patching here)
        result = self.reader._read_parquet_in_batches(mock_parquet_file)

        # Verify that the result is a non-empty list
        self.assertIsInstance(result, list)
        self.assertTrue(len(result) > 0)
        # Verify that all chunks are DataFrames
        for chunk in result:
            self.assertIsInstance(chunk, pd.DataFrame)

        # Verify that concatenating all chunks gives us back the original data
        if len(result) > 1:
            concatenated = pd.concat(result, ignore_index=True)
            pd.testing.assert_frame_equal(concatenated, batch_data)
        else:
            pd.testing.assert_frame_equal(result[0], batch_data)
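
    # The final test is an end-to-end check against a real parquet file. It
    # imports pyarrow lazily inside the method and skips itself when the
    # sample file is absent, so the rest of the suite runs without the fixture.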

    def test_real_parquet_file_batch_processing(self):
        """Test batch processing with the real flights-1m.parquet file (~7 MB)"""
        import os

        from pyarrow.parquet import ParquetFile

        # Path to the test parquet file
        test_file_path = os.path.join(
            os.path.dirname(__file__), "test_files", "flights-1m.parquet"
        )
        # Skip the test if the file doesn't exist
        if not os.path.exists(test_file_path):
            self.skipTest(f"Test file not found: {test_file_path}")

        try:
            # Create a ParquetFile object from the real file
            parquet_file = ParquetFile(test_file_path)

            # Print some basic info about the file
            file_size = os.path.getsize(test_file_path)
            total_rows = parquet_file.metadata.num_rows
            print(
                f"Testing with real parquet file: {file_size} bytes, {total_rows} rows"
            )

            result = self.reader._read_parquet_in_batches(parquet_file)
            fallback_method_result = dataframe_to_chunks(
                parquet_file.read().to_pandas()
            )

            # Compare total row counts rather than exact chunk shapes, since
            # chunk boundaries can differ between the batched and regular paths
            result_processed_rows = sum(len(chunk) for chunk in result)
            fallback_method_processed_rows = sum(
                len(chunk) for chunk in fallback_method_result
            )
            self.assertEqual(result_processed_rows, fallback_method_processed_rows)
        except Exception as e:
            self.fail(f"Real parquet file test failed: {e}")


if __name__ == "__main__":
    unittest.main()