OpenMetadata/ingestion/tests/unit/readers/test_parquet_batches.py

# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for ParquetDataFrameReader _read_parquet_in_batches method
"""
import unittest
from unittest.mock import Mock, patch

import pandas as pd

from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
    LocalConfig,
)
from metadata.readers.dataframe.common import dataframe_to_chunks
from metadata.readers.dataframe.parquet import ParquetDataFrameReader


class TestParquetBatchReading(unittest.TestCase):
    """
    Test the _read_parquet_in_batches method functionality
    """

    def setUp(self):
        """Set up test fixtures"""
        self.config_source = LocalConfig()
        self.reader = ParquetDataFrameReader(self.config_source, None)
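
    # The tests below exercise ParquetDataFrameReader._read_parquet_in_batches.
    # From what they assert, the method behaves roughly like the sketch below
    # (a reconstruction for orientation, not the actual implementation; the
    # real default batch_size is not asserted anywhere in this suite):
    #
    #     def _read_parquet_in_batches(self, parquet_file, batch_size=...):
    #         try:
    #             chunks = []
    #             for batch in parquet_file.iter_batches(batch_size=batch_size):
    #                 df = batch.to_pandas()
    #                 if not df.empty:  # empty batches are skipped
    #                     chunks.extend(dataframe_to_chunks(df))
    #             return chunks
    #         except Exception:
    #             # falls back to a full read, logging a warning first
    #             return dataframe_to_chunks(parquet_file.read().to_pandas())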

    def test_successful_batch_reading_with_iter_batches(self):
        """Test successful batched reading when iter_batches is available"""
        # Create a mock parquet file with iter_batches capability
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create sample data for two batches
        batch1_data = pd.DataFrame(
            {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}
        )
        batch2_data = pd.DataFrame(
            {"id": [4, 5], "name": ["David", "Eve"], "age": [40, 45]}
        )

        # Create mock Arrow record batches
        mock_batch1 = Mock()
        mock_batch1.to_pandas.return_value = batch1_data
        mock_batch2 = Mock()
        mock_batch2.to_pandas.return_value = batch2_data
        mock_parquet_file.iter_batches.return_value = iter([mock_batch1, mock_batch2])

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            # Mock dataframe_to_chunks to return each non-empty dataframe as a
            # single chunk
            mock_chunks.side_effect = lambda df: [df] if not df.empty else []

            # Call the method under test
            result = self.reader._read_parquet_in_batches(
                mock_parquet_file, batch_size=1000
            )

            # Verify iter_batches was called with the correct batch_size
            mock_parquet_file.iter_batches.assert_called_once_with(batch_size=1000)
            # Verify dataframe_to_chunks was called twice (once per batch)
            self.assertEqual(mock_chunks.call_count, 2)
            # Verify the result contains the chunks from both batches
            self.assertEqual(len(result), 2)
            pd.testing.assert_frame_equal(result[0], batch1_data)
            pd.testing.assert_frame_equal(result[1], batch2_data)

    def test_batch_reading_with_empty_batches(self):
        """Test that empty batches are properly skipped"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create one non-empty and one empty batch
        non_empty_batch = pd.DataFrame({"id": [1], "name": ["Alice"]})
        empty_batch = pd.DataFrame()
        mock_batch1 = Mock()
        mock_batch1.to_pandas.return_value = non_empty_batch
        mock_batch2 = Mock()
        mock_batch2.to_pandas.return_value = empty_batch
        mock_parquet_file.iter_batches.return_value = iter([mock_batch1, mock_batch2])

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.side_effect = lambda df: [df] if not df.empty else []

            result = self.reader._read_parquet_in_batches(mock_parquet_file)

            # Should only call dataframe_to_chunks once, for the non-empty batch
            mock_chunks.assert_called_once_with(non_empty_batch)
            # The result should only contain the non-empty batch
            self.assertEqual(len(result), 1)
            pd.testing.assert_frame_equal(result[0], non_empty_batch)

    def test_exception_handling_with_successful_fallback(self):
        """Test exception handling when batching fails but the fallback succeeds"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Make iter_batches raise an exception...
        mock_parquet_file.iter_batches.side_effect = Exception("Batching failed")
        # ...but let the regular (non-batched) read succeed
        sample_data = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
        mock_arrow_table = Mock()
        mock_arrow_table.to_pandas.return_value = sample_data
        mock_parquet_file.read.return_value = mock_arrow_table

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.return_value = [sample_data]
            with patch("metadata.readers.dataframe.parquet.logger") as mock_logger:
                result = self.reader._read_parquet_in_batches(mock_parquet_file)

                # Verify a warning was logged about the fallback; the exact
                # message is asserted, so changing the log text breaks this test
                mock_logger.warning.assert_called_with(
                    "Batched reading failed: Batching failed. Falling back to regular reading - "
                    "this may cause memory issues for large files"
                )
                # Verify the regular read was used as the fallback
                mock_parquet_file.read.assert_called_once()
                # Verify the result
                self.assertEqual(result, [sample_data])

    def test_exception_handling_with_failed_fallback(self):
        """Test that the fallback exception propagates when both batching and the regular read fail"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Make both iter_batches and read fail
        mock_parquet_file.iter_batches.side_effect = Exception("Batching failed")
        mock_parquet_file.read.side_effect = Exception("Regular read failed")

        with patch("metadata.readers.dataframe.parquet.logger") as mock_logger:
            # Should raise the fallback exception
            with self.assertRaises(Exception) as context:
                self.reader._read_parquet_in_batches(mock_parquet_file)
            self.assertEqual(str(context.exception), "Regular read failed")
            # Verify the error was logged
            mock_logger.error.assert_called_with(
                "Failed to read parquet file: Regular read failed"
            )

    def test_custom_batch_size(self):
        """Test that a custom batch size is properly passed to iter_batches"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create a simple batch
        batch_data = pd.DataFrame({"id": [1], "name": ["Alice"]})
        mock_batch = Mock()
        mock_batch.to_pandas.return_value = batch_data
        mock_parquet_file.iter_batches.return_value = iter([mock_batch])

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.return_value = [batch_data]

            # Test with a custom batch size
            custom_batch_size = 5000
            self.reader._read_parquet_in_batches(
                mock_parquet_file, batch_size=custom_batch_size
            )

            # Verify the custom batch size was forwarded to iter_batches
            mock_parquet_file.iter_batches.assert_called_once_with(
                batch_size=custom_batch_size
            )

    def test_batch_counting_and_logging(self):
        """Test that batch counting and logging work correctly"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create multiple batches
        batches_data = [
            pd.DataFrame({"id": [1], "name": ["Alice"]}),
            pd.DataFrame({"id": [2], "name": ["Bob"]}),
            pd.DataFrame({"id": [3], "name": ["Charlie"]}),
        ]
        mock_batches = []
        for data in batches_data:
            mock_batch = Mock()
            mock_batch.to_pandas.return_value = data
            mock_batches.append(mock_batch)
        mock_parquet_file.iter_batches.return_value = iter(mock_batches)

        with patch(
            "metadata.readers.dataframe.parquet.dataframe_to_chunks"
        ) as mock_chunks:
            mock_chunks.side_effect = lambda df: [df] if not df.empty else []
            with patch("metadata.readers.dataframe.parquet.logger") as mock_logger:
                result = self.reader._read_parquet_in_batches(mock_parquet_file)

                # Verify the expected info logs were emitted
                mock_logger.info.assert_any_call(
                    "Reading large parquet file in batches to avoid memory issues"
                )
                mock_logger.info.assert_any_call(
                    "Successfully processed 3 batches from large parquet file"
                )
                # Verify all batches were processed
                self.assertEqual(len(result), 3)

    def test_dataframe_to_chunks_integration(self):
        """Test integration with the real dataframe_to_chunks function"""
        mock_parquet_file = Mock()
        mock_parquet_file.iter_batches = Mock()

        # Create a batch large enough to potentially be split into multiple chunks
        batch_data = pd.DataFrame(
            {
                "id": list(range(1000)),
                "name": [f"User{i}" for i in range(1000)],
            }
        )
        mock_batch = Mock()
        mock_batch.to_pandas.return_value = batch_data
        mock_parquet_file.iter_batches.return_value = iter([mock_batch])

        # Use the real dataframe_to_chunks function (no patching here)
        result = self.reader._read_parquet_in_batches(mock_parquet_file)

        # Verify that the result is a non-empty list
        self.assertIsInstance(result, list)
        self.assertTrue(len(result) > 0)
        # Verify that all chunks are DataFrames
        for chunk in result:
            self.assertIsInstance(chunk, pd.DataFrame)

        # Verify that concatenating all chunks gives us back the original data
        if len(result) > 1:
            concatenated = pd.concat(result, ignore_index=True)
            pd.testing.assert_frame_equal(concatenated, batch_data)
        else:
            pd.testing.assert_frame_equal(result[0], batch_data)
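
    # The final test is an end-to-end check against a real parquet file. It
    # imports pyarrow lazily inside the method and skips itself when the
    # sample file is absent, so the rest of the suite runs without the fixture.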

    def test_real_parquet_file_batch_processing(self):
        """Test batch processing with the real flights-1m.parquet file (~7 MB)"""
        import os

        from pyarrow.parquet import ParquetFile

        # Path to the test parquet file
        test_file_path = os.path.join(
            os.path.dirname(__file__), "test_files", "flights-1m.parquet"
        )
        # Skip the test if the file doesn't exist
        if not os.path.exists(test_file_path):
            self.skipTest(f"Test file not found: {test_file_path}")

        try:
            # Create a ParquetFile object from the real file
            parquet_file = ParquetFile(test_file_path)

            # Print some basic info about the file
            file_size = os.path.getsize(test_file_path)
            total_rows = parquet_file.metadata.num_rows
            print(
                f"Testing with real parquet file: {file_size} bytes, {total_rows} rows"
            )

            result = self.reader._read_parquet_in_batches(parquet_file)
            fallback_method_result = dataframe_to_chunks(
                parquet_file.read().to_pandas()
            )

            # Compare total row counts rather than exact chunk shapes, since
            # chunk boundaries can differ between the batched and regular paths
            result_processed_rows = sum(len(chunk) for chunk in result)
            fallback_method_processed_rows = sum(
                len(chunk) for chunk in fallback_method_result
            )
            self.assertEqual(result_processed_rows, fallback_method_processed_rows)
        except Exception as e:
            self.fail(f"Real parquet file test failed: {e}")


if __name__ == "__main__":
    unittest.main()