# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Datalake ingestion integration tests"""
import pytest
from metadata.generated.schema.entity.data.table import DataType, Table
from metadata.ingestion.ometa.models import EntityList
from metadata.ingestion.ometa.ometa_api import OpenMetadata


class TestDatalake:
    """Datalake ingestion and profiler E2E tests"""

    metadata: OpenMetadata = None
    s3_client = None

    @pytest.fixture(autouse=True)
    def set_metadata(self, metadata):
        """Attach the OpenMetadata client provided by the `metadata` fixture"""
        self.metadata = metadata

    @pytest.mark.order(10000)
    def test_ingestion(self, run_ingestion):
        """Test ingestion of datalake data"""
        # The `run_ingestion` fixture has already ingested our S3 data;
        # list the tables it created under the integration-test database.
        resp: EntityList[Table] = self.metadata.list_entities(
            entity=Table, params={"database": "datalake_for_integration_tests.default"}
        )  # type: ignore

        entities = resp.entities
        assert len(entities) == 3

        names = [entity.name.__root__ for entity in entities]
        assert sorted(["names.json", "new_users.parquet", "users.csv"]) == sorted(names)

        # JSON columns should have their nested structure expanded into children
        for entity in entities:
            columns = entity.columns
            for column in columns:
                if column.dataType == DataType.JSON:
                    assert column.children
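
    # Hedged sketch (not in the original test): the JSON assertion above implies
    # that datalake columns form a tree via `Column.children`. A helper like the
    # one below could walk that tree and collect every nested column name. It
    # assumes each child is itself a `Column` whose `name` exposes `__root__`,
    # mirroring `entity.name.__root__` used above; treat it as illustrative only.
    @staticmethod
    def _collect_column_names(columns, prefix=""):
        """Illustrative helper: flatten nested column names like `parent.child`."""
        names = []
        for column in columns or []:
            full_name = f"{prefix}{column.name.__root__}"
            names.append(full_name)
            if column.children:
                names.extend(
                    TestDatalake._collect_column_names(
                        column.children, prefix=f"{full_name}."
                    )
                )
        return names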

    def test_profiler(self, run_profiler):
        """Test that the profiler produced sample data for every ingested file"""
        csv_ = self.metadata.get_by_name(
            entity=Table,
            fqn='datalake_for_integration_tests.default.MyBucket."users.csv"',
            fields=["tableProfilerConfig"],
        )
        parquet_ = self.metadata.get_by_name(
            entity=Table,
            fqn='datalake_for_integration_tests.default.MyBucket."new_users.parquet"',
            fields=["tableProfilerConfig"],
        )
        json_ = self.metadata.get_by_name(
            entity=Table,
            fqn='datalake_for_integration_tests.default.MyBucket."names.json"',
            fields=["tableProfilerConfig"],
        )

        csv_sample_data = self.metadata.get_sample_data(csv_)
        parquet_sample_data = self.metadata.get_sample_data(parquet_)
        json_sample_data = self.metadata.get_sample_data(json_)

        assert csv_sample_data.sampleData.rows
        assert parquet_sample_data.sampleData.rows
        assert json_sample_data.sampleData.rows
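
    # Hedged sketch (not in the original test): beyond checking that sample rows
    # exist, one might also compare the sampled column names against what the
    # ingestion discovered. This assumes `sampleData.columns` is a list of
    # `ColumnName` objects exposing `__root__`, the same pydantic-v1 pattern used
    # for `entity.name.__root__` in test_ingestion; the expected names below are
    # placeholders, not values taken from the real users.csv fixture.
    def _example_check_sample_columns(self, sample_table):
        """Illustrative only: assert expected columns appear in the sample data."""
        expected = {"first_name", "last_name"}  # hypothetical column names
        sampled = {column.__root__ for column in sample_table.sampleData.columns}
        assert expected.issubset(sampled)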