#  Copyright 2022 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Test database connectors with CLI
"""
import os
import re
from abc import abstractmethod
from contextlib import redirect_stdout
from enum import Enum
from io import StringIO
from pathlib import Path
from typing import List, Optional
from unittest import TestCase

import pytest
import yaml

from metadata.cmd import metadata
from metadata.config.common import load_config_file
from metadata.generated.schema.entity.data.table import Table
from metadata.ingestion.api.sink import SinkStatus
from metadata.ingestion.api.source import SourceStatus
from metadata.ingestion.api.workflow import Workflow
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.utils.constants import UTF_8

PATH_TO_RESOURCES = os.path.dirname(os.path.realpath(__file__))

# Escape sequences of the form ESC ... m, compiled once for both extractors.
_ANSI_ESCAPE = re.compile(r"\x1b[^m]*m")


def _flatten_output(output: str, ansi_replacement: str) -> str:
    """
    Put the captured CLI output on a single line: newlines become
    spaces, runs of spaces collapse to one, and every ANSI escape
    sequence is replaced by `ansi_replacement`.
    """
    flattened = output.replace("\n", " ")
    flattened = re.sub(" +", " ", flattened)
    return _ANSI_ESCAPE.sub(ansi_replacement, flattened)


class E2EType(Enum):
    """
    E2E Type Enum Class
    """

    INGEST = "ingest"
    PROFILER = "profiler"
    INGEST_FILTER_SCHEMA = "ingest-filter-schema"
    INGEST_FILTER_TABLE = "ingest-filter-table"
    INGEST_FILTER_MIX = "ingest-filter-mix"


class CliDBBase(TestCase):
    """
    CLI DB Base class
    """

    class TestSuite(TestCase):  # pylint: disable=too-many-public-methods
        """
        TestSuite class to define test structure
        """

        # Class-level buffer that receives the CLI stdout of every run.
        catcher = StringIO()
        openmetadata: OpenMetadata
        test_file_path: str
        config_file_path: str

        # 1. deploy vanilla ingestion
        @pytest.mark.order(1)
        def test_vanilla_ingestion(self) -> None:
            """
            Run a plain ingestion and check the reported statuses
            """
            # build config file for ingest
            self.build_config_file(E2EType.INGEST)
            # run ingest with new tables
            self.run_command()
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_for_vanilla_ingestion(source_status, sink_status)

        # 2. create a new table + deploy ingestion with views, sample data, and profiler
        @pytest.mark.order(2)
        def test_create_table_with_profiler(self) -> None:
            """
            Create a table and a view, ingest them, then run the profiler
            """
            # delete table in case it exists
            self.delete_table_and_view()
            # create a table and a view
            self.create_table_and_view()
            # build config file for ingest
            self.build_config_file()
            # run ingest with new tables
            self.run_command()
            # the ingestion output is not inspected, only the profiler one
            self._reset_catcher()
            # build config file for profiler
            self.build_config_file(E2EType.PROFILER)
            # run profiler with new tables
            self.run_command("profile")
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_for_table_with_profiler(source_status, sink_status)

        # 3. delete the new table + deploy marking tables as deleted
        @pytest.mark.order(3)
        def test_delete_table_is_marked_as_deleted(self) -> None:
            """
            Delete the table and view, ingest again and check the statuses
            """
            # delete table created in previous test
            self.delete_table_and_view()
            # build config file for ingest
            self.build_config_file()
            # run ingest
            self.run_command()
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_for_delete_table_is_marked_as_deleted(
                source_status, sink_status
            )

        # 4. vanilla ingestion + include schema filter pattern
        @pytest.mark.order(4)
        def test_schema_filter_includes(self) -> None:
            """
            Ingest with an `includes` schema filter pattern
            """
            # build config file for ingest with filters
            self.build_config_file(
                E2EType.INGEST_FILTER_SCHEMA, {"includes": self.get_includes_schemas()}
            )
            # run ingest
            self.run_command()
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_filtered_schemas_includes(source_status, sink_status)

        # 5. vanilla ingestion + exclude schema filter pattern
        @pytest.mark.order(5)
        def test_schema_filter_excludes(self) -> None:
            """
            Ingest with an `excludes` schema filter pattern. The list from
            `get_includes_schemas` is reused as the exclusion list.
            """
            # build config file for ingest with filters
            self.build_config_file(
                E2EType.INGEST_FILTER_SCHEMA, {"excludes": self.get_includes_schemas()}
            )
            # run ingest
            self.run_command()
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_filtered_schemas_excludes(source_status, sink_status)

        # 6. Vanilla ingestion + include table filter pattern
        @pytest.mark.order(6)
        def test_table_filter_includes(self) -> None:
            """
            Ingest with an `includes` table filter pattern
            """
            # build config file for ingest with filters
            self.build_config_file(
                E2EType.INGEST_FILTER_TABLE, {"includes": self.get_includes_tables()}
            )
            # run ingest
            self.run_command()
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_filtered_tables_includes(source_status, sink_status)

        # 7. Vanilla ingestion + exclude table filter pattern
        @pytest.mark.order(7)
        def test_table_filter_excludes(self) -> None:
            """
            Ingest with an `excludes` table filter pattern. The list from
            `get_includes_tables` is reused as the exclusion list.
            """
            # build config file for ingest with filters
            self.build_config_file(
                E2EType.INGEST_FILTER_TABLE, {"excludes": self.get_includes_tables()}
            )
            # run ingest
            self.run_command()
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_filtered_tables_excludes(source_status, sink_status)

        # 8. Vanilla ingestion mixing filters
        @pytest.mark.order(8)
        def test_table_filter_mix(self) -> None:
            """
            Ingest combining a schema filter with table includes and excludes
            """
            # build config file for ingest with filters
            self.build_config_file(
                E2EType.INGEST_FILTER_MIX,
                {
                    "schema": {"includes": self.get_includes_schemas()},
                    "table": {
                        "includes": self.get_includes_tables(),
                        "excludes": self.get_excludes_tables(),
                    },
                },
            )
            # run ingest
            self.run_command()
            result = self.catcher.getvalue()
            self._reset_catcher()
            sink_status, source_status = self.retrieve_statuses(result)
            self.assert_filtered_mix(source_status, sink_status)

        # 9. Run usage
        @pytest.mark.order(9)
        def test_usage(self) -> None:
            # to be implemented
            pass

        # 10. Run queries in the source (creates, inserts, views) and ingest metadata & Lineage
        @pytest.mark.order(10)
        def test_lineage(self) -> None:
            # to be implemented
            pass

        def _reset_catcher(self) -> None:
            """
            Empty the stdout buffer so the next run starts from a clean slate.

            `truncate(0)` alone leaves the stream position untouched, so the
            following write would land at the old offset and the gap would be
            padded with NUL characters. Rewinding first avoids that.
            """
            self.catcher.seek(0)
            self.catcher.truncate(0)

        def run_command(self, command: str = "ingest") -> None:
            """
            Invoke the `metadata` CLI entry point with `command` and the
            generated test config, capturing stdout into `self.catcher`.
            The call is expected to end by raising `SystemExit`.
            """
            args = [
                command,
                "-c",
                self.test_file_path,
            ]
            with redirect_stdout(self.catcher):
                with self.assertRaises(SystemExit):
                    metadata(args)

        def build_config_file(
            self,
            test_type: E2EType = E2EType.INGEST,
            extra_args: Optional[dict] = None,
        ) -> None:
            """
            Load the YAML at `config_file_path`, adapt it for `test_type`
            through `build_yaml` and write the result to `test_file_path`.
            """
            with open(self.config_file_path, encoding=UTF_8) as config_file:
                config_yaml = yaml.safe_load(config_file)
                config_yaml = self.build_yaml(config_yaml, test_type, extra_args)
                with open(self.test_file_path, "w", encoding=UTF_8) as test_file:
                    yaml.dump(config_yaml, test_file)

        def retrieve_statuses(self, result):
            """
            Parse the captured CLI output and return the statuses as
            `(sink_status, source_status)`, in that order.
            """
            source_status: SourceStatus = self.extract_source_status(result)
            sink_status: SinkStatus = self.extract_sink_status(result)
            return sink_status, source_status

        def retrieve_table(self, table_name_fqn: str) -> Table:
            """
            Fetch the Table entity with the given fully qualified name
            """
            return self.openmetadata.get_by_name(entity=Table, fqn=table_name_fqn)

        def retrieve_sample_data(self, table_name_fqn: str) -> Table:
            """
            Fetch the Table by name and return the result of
            `get_sample_data` for it
            """
            table: Table = self.openmetadata.get_by_name(
                entity=Table, fqn=table_name_fqn
            )
            return self.openmetadata.get_sample_data(table=table)

        def retrieve_lineage(self, table_name_fqn: str) -> dict:
            """
            Query the lineage endpoint for the table, three levels up and
            three levels down
            """
            return self.openmetadata.client.get(
                f"/lineage/table/name/{table_name_fqn}?upstreamDepth=3&downstreamDepth=3"
            )

        @staticmethod
        def get_workflow(connector: str) -> Workflow:
            """
            Create a Workflow from `database/<connector>/<connector>.yaml`,
            resolved relative to this file's directory
            """
            config_file = (
                Path(PATH_TO_RESOURCES) / "database" / connector / f"{connector}.yaml"
            )
            config_dict = load_config_file(config_file)
            return Workflow.create(config_dict)

        @staticmethod
        def extract_source_status(output) -> SourceStatus:
            """
            Extract the text following "Source Status:" from the CLI output
            and parse it into a SourceStatus. The section ends at
            "Processor Status:" when present, otherwise at "Sink Status:".
            """
            # ANSI sequences are replaced by a space here (the sink extractor
            # drops them); the difference is kept as in the original code.
            output_clean = _flatten_output(output, " ")
            # The output holds no newlines at this point, so a substring test
            # is equivalent to matching ".* Processor Status: .*".
            if " Processor Status: " in output_clean:
                matches = re.findall(
                    "Source Status: (.*?) Processor Status: .*", output_clean.strip()
                )
            else:
                matches = re.findall(
                    "Source Status: (.*?) Sink Status: .*", output_clean.strip()
                )
            # NOTE(review): eval runs on text captured from the CLI's stdout;
            # do not reuse this on output from an untrusted process.
            return SourceStatus.parse_obj(
                eval(matches[0].strip())  # pylint: disable=eval-used
            )

        @staticmethod
        def extract_sink_status(output) -> SinkStatus:
            """
            Extract the text between "Sink Status:" and "Workflow finished"
            from the CLI output and parse it into a SinkStatus.
            """
            output_clean = _flatten_output(output, "")
            status_text = re.findall(
                ".* Sink Status: (.*?) Workflow finished.*", output_clean.strip()
            )[0].strip()
            # NOTE(review): eval runs on text captured from the CLI's stdout;
            # do not reuse this on output from an untrusted process.
            return SinkStatus.parse_obj(eval(status_text))  # pylint: disable=eval-used

        @staticmethod
        @abstractmethod
        def get_connector_name() -> str:
            raise NotImplementedError()

        @abstractmethod
        def create_table_and_view(self) -> None:
            raise NotImplementedError()

        @abstractmethod
        def delete_table_and_view(self) -> None:
            raise NotImplementedError()

        @abstractmethod
        def assert_for_vanilla_ingestion(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ) -> None:
            raise NotImplementedError()

        @abstractmethod
        def assert_for_table_with_profiler(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ):
            raise NotImplementedError()

        @abstractmethod
        def assert_for_delete_table_is_marked_as_deleted(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ):
            raise NotImplementedError()

        @abstractmethod
        def assert_filtered_schemas_includes(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ):
            raise NotImplementedError()

        @abstractmethod
        def assert_filtered_schemas_excludes(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ):
            raise NotImplementedError()

        @abstractmethod
        def assert_filtered_tables_includes(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ):
            raise NotImplementedError()

        @abstractmethod
        def assert_filtered_tables_excludes(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ):
            raise NotImplementedError()

        @abstractmethod
        def assert_filtered_mix(
            self, source_status: SourceStatus, sink_status: SinkStatus
        ):
            raise NotImplementedError()

        @staticmethod
        @abstractmethod
        def get_includes_schemas() -> List[str]:
            raise NotImplementedError()

        @staticmethod
        @abstractmethod
        def get_includes_tables() -> List[str]:
            raise NotImplementedError()

        @staticmethod
        @abstractmethod
        def get_excludes_tables() -> List[str]:
            raise NotImplementedError()

        @staticmethod
        def build_yaml(
            config_yaml: dict, test_type: E2EType, extra_args: Optional[dict]
        ):
            """
            Build yaml as per E2EType

            `config_yaml` is modified in place and also returned.
            `extra_args` carries the filter pattern(s) for the filter test
            types and is not read for INGEST or PROFILER.
            """
            if test_type == E2EType.PROFILER:
                # Replacing the whole sourceConfig drops the ingestion config,
                # so no separate delete is needed.
                config_yaml["source"]["sourceConfig"] = {
                    "config": {
                        "type": "Profiler",
                        "generateSampleData": True,
                        "profileSample": 1,
                    }
                }
                config_yaml["processor"] = {"type": "orm-profiler", "config": {}}
            if test_type == E2EType.INGEST_FILTER_SCHEMA:
                config_yaml["source"]["sourceConfig"]["config"][
                    "schemaFilterPattern"
                ] = extra_args
            if test_type == E2EType.INGEST_FILTER_TABLE:
                config_yaml["source"]["sourceConfig"]["config"][
                    "tableFilterPattern"
                ] = extra_args
            if test_type == E2EType.INGEST_FILTER_MIX:
                config_yaml["source"]["sourceConfig"]["config"][
                    "schemaFilterPattern"
                ] = extra_args["schema"]
                config_yaml["source"]["sourceConfig"]["config"][
                    "tableFilterPattern"
                ] = extra_args["table"]
            return config_yaml