OpenMetadata/ingestion/tests/unit/readers/test_df_reader.py

#  Copyright 2025 Collate
#  Licensed under the Collate Community License, Version 1.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Validate the reader factory and the logic to read dataframes from local files.
"""
from pathlib import Path
from unittest import TestCase

from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
    LocalConfig,
)
from metadata.readers.dataframe.models import DatalakeTableSchemaWrapper
from metadata.readers.dataframe.reader_factory import SupportedTypes
from metadata.utils.datalake.datalake_utils import fetch_dataframe

# Folder with the datalake fixture files: "resources/datalake" under the
# parent of the directory that contains this module.
ROOT_PATH = Path(__file__).parent.parent.joinpath("resources", "datalake")


class TestDataFrameReader(TestCase):
    """
    Load different files from resources and validate
    that the reader can properly get the df out of it.

    Each test reads one fixture found under ``ROOT_PATH`` through
    ``fetch_dataframe`` using a ``LocalConfig`` source, then checks the
    shape and the column names of the first dataframe that comes back.
    """

    # Expected column names, shared by the tests reading the same kind of data
    TRANSACTION_COLUMNS = ("transaction_id", "transaction_value")
    EMPLOYEE_COLUMNS = ("name", "id", "version", "Company")
    AVRO_COLUMNS = (
        "Boolean",
        "pdBoolean",
        "Float64",
        "Int64",
        "pdInt64",
        "String",
        "pdString",
        "DateTime64",
    )

    def _fetch_local(self, file_name, **wrapper_kwargs):
        """
        Read a fixture file through ``fetch_dataframe`` with a local config.

        Args:
            file_name: name of the fixture, relative to ``ROOT_PATH``.
            **wrapper_kwargs: extra ``DatalakeTableSchemaWrapper`` fields,
                such as ``separator`` or ``file_extension``.

        Returns:
            The value returned by ``fetch_dataframe`` for that file.
        """
        return fetch_dataframe(
            config_source=LocalConfig(),
            client=None,
            file_fqn=DatalakeTableSchemaWrapper(
                key=str(ROOT_PATH / file_name),
                # Placeholder value: these tests read from the local path in `key`
                bucket_name="unused",
                **wrapper_kwargs,
            ),
        )

    def _assert_first_frame(self, df_list, expected_shape, expected_columns):
        """
        Check that ``df_list`` is non-empty and validate its first dataframe.

        Args:
            df_list: result of ``fetch_dataframe``.
            expected_shape: ``(rows, columns)`` tuple expected for ``df_list[0]``.
            expected_columns: expected column names of ``df_list[0]``, in order.
        """
        self.assertIsNotNone(df_list, "fetch_dataframe returned None")
        self.assertGreater(len(df_list), 0, "fetch_dataframe returned no dataframes")

        first_df = df_list[0]
        self.assertEqual(first_df.shape, expected_shape)
        self.assertEqual(list(first_df.columns), list(expected_columns))

    def test_dsv_no_extension_reader(self):
        """Read a file without extension by passing the CSV type explicitly."""
        df_list = self._fetch_local(
            "transactions_1", file_extension=SupportedTypes.CSV
        )

        self._assert_first_frame(df_list, (5, 2), self.TRANSACTION_COLUMNS)

    def test_dsv_reader(self):
        """Read a ``.csv`` file using the wrapper defaults."""
        df_list = self._fetch_local("transactions_1.csv")

        self._assert_first_frame(df_list, (5, 2), self.TRANSACTION_COLUMNS)

    def test_dsv_reader_with_separator(self):
        """Read a ``.csv`` file passing ``;`` as the separator."""
        df_list = self._fetch_local("transactions_separator.csv", separator=";")

        self._assert_first_frame(df_list, (5, 2), self.TRANSACTION_COLUMNS)

    def test_json_reader(self):
        """Read a ``.json`` file."""
        df_list = self._fetch_local("employees.json")

        self._assert_first_frame(df_list, (4, 4), self.EMPLOYEE_COLUMNS)

    def test_jsonl_reader(self):
        """Read a ``.jsonl`` file."""
        df_list = self._fetch_local("employees.jsonl")

        self._assert_first_frame(df_list, (4, 4), self.EMPLOYEE_COLUMNS)

    def test_avro_reader(self):
        """Read a ``.avro`` file."""
        df_list = self._fetch_local("example.avro")

        self._assert_first_frame(df_list, (4, 8), self.AVRO_COLUMNS)