Fix #3826 & #3886 - Profiler workflow & filter pattern (#3893)

Pere Miquel Brull 2022-04-06 17:05:00 +02:00 committed by GitHub
parent 7292695bd3
commit bd4071bd64
8 changed files with 178 additions and 189 deletions

View File

@@ -35,42 +35,3 @@ class DynamicTypedConfig(ConfigModel):

 class WorkflowExecutionError(Exception):
     """An error occurred when executing the workflow"""

-
-class IncludeFilterPattern(ConfigModel):
-    """A class to store allow deny regexes"""
-
-    includes: List[str] = [".*"]
-    excludes: List[str] = []
-    alphabet: str = "[A-Za-z0-9 _.-]"
-
-    @property
-    def alphabet_pattern(self):
-        return re.compile(f"^{self.alphabet}+$")
-
-    @classmethod
-    def allow_all(cls):
-        return IncludeFilterPattern()
-
-    def included(self, string: str) -> bool:
-        try:
-            for exclude in self.excludes:
-                if re.match(exclude, string):
-                    return False
-
-            for include in self.includes:
-                if re.match(include, string):
-                    return True
-            return False
-        except Exception as err:
-            raise Exception("Regex Error: {}".format(err))
-
-    def is_fully_specified_include_list(self) -> bool:
-        for filter_pattern in self.includes:
-            if not self.alphabet_pattern.match(filter_pattern):
-                return False
-        return True
-
-    def get_allowed_list(self):
-        assert self.is_fully_specified_include_list()
-        return [a for a in self.includes if self.included(a)]
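
Note the behaviour being removed here: `included()` returned True for names to keep, and excludes were checked before includes. The replacement helpers added later in this commit (`metadata/utils/filters.py`) invert that convention, returning True for names to drop and giving includes precedence. A small sketch of the old class, with hypothetical patterns:

    pattern = IncludeFilterPattern(includes=["users_.*"], excludes=[".*_tmp"])
    pattern.included("users_2022")  # True: no exclude matches, an include does
    pattern.included("users_tmp")   # False: the exclude matches first
    pattern.included("orders")      # False: nothing in `includes` matches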

View File

@@ -16,6 +16,7 @@ import logging
 import re
 import traceback
 import uuid
+from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Dict, Iterable, List, Optional, Tuple

@@ -56,7 +57,6 @@ from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable
 from metadata.ingestion.models.table_metadata import DeleteTable
 from metadata.ingestion.ometa.client import APIError
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
-from metadata.ingestion.source.sql_source_common import SQLSourceStatus
 from metadata.orm_profiler.orm.converter import ometa_to_orm
 from metadata.orm_profiler.profiler.default import DefaultProfiler
 from metadata.utils.column_type_parser import ColumnTypeParser
@@ -66,6 +66,26 @@ from metadata.utils.helpers import get_database_service_or_create, ingest_lineag

 logger: logging.Logger = logging.getLogger(__name__)

+
+@dataclass
+class SQLSourceStatus(SourceStatus):
+    """
+    Reports the source status after ingestion
+    """
+
+    success: List[str] = field(default_factory=list)
+    failures: List[str] = field(default_factory=list)
+    warnings: List[str] = field(default_factory=list)
+    filtered: List[str] = field(default_factory=list)
+
+    def scanned(self, record: str) -> None:
+        self.success.append(record)
+        logger.info(f"Table Scanned: {record}")
+
+    def filter(self, record: str, err: str) -> None:
+        self.filtered.append(record)
+        logger.warning(f"Filtered Table {record} due to {err}")
+
+
 def _get_table_description(schema: str, table: str, inspector: Inspector) -> str:
     description = None
     try:
@@ -105,7 +125,7 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
         self.service = get_database_service_or_create(config, metadata_config)
         self.metadata = OpenMetadata(metadata_config)
         self.status = SQLSourceStatus()
-        self.engine = get_engine(config=self.config)
+        self.engine = get_engine(workflow_source=self.config)
         self._session = None  # We will instantiate this just if needed
         self.connection = self.engine.connect()
         self.data_profiler = None
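
With `SQLSourceStatus` relocated here, both the SQL source and the profiler workflow (see the `workflow.py` hunks below) report through the same class. A minimal usage sketch based only on the methods shown above; the record names are illustrative:

    status = SQLSourceStatus()
    status.scanned("my_service.db.my_schema.users")  # appended to `success`, logged at INFO
    status.filter("my_service.db.tmp_schema", "Schema pattern not allowed")  # appended to `filtered`, logged at WARNING
    assert len(status.success) == 1 and len(status.filtered) == 1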

View File

@@ -1,100 +0,0 @@
-import logging
-from abc import abstractmethod
-from dataclasses import dataclass, field
-from typing import List, Optional
-from urllib.parse import quote_plus
-
-from pydantic import SecretStr
-
-from metadata.generated.schema.entity.services.databaseService import (
-    DatabaseServiceType,
-)
-from metadata.generated.schema.operations.pipelines.databaseServiceMetadataPipeline import (
-    DatabaseServiceMetadataPipeline,
-)
-from metadata.ingestion.api.source import SourceStatus
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-@dataclass
-class SQLSourceStatus(SourceStatus):
-    """
-    Reports the source status after ingestion
-    """
-
-    success: List[str] = field(default_factory=list)
-    failures: List[str] = field(default_factory=list)
-    warnings: List[str] = field(default_factory=list)
-    filtered: List[str] = field(default_factory=list)
-
-    def scanned(self, record: str) -> None:
-        self.success.append(record)
-        logger.info(f"Table Scanned: {record}")
-
-    def filter(self, record: str, err: str) -> None:
-        self.filtered.append(record)
-        logger.warning(f"Filtered Table {record} due to {err}")
-
-
-def build_sql_source_connection_url(
-    host_port: str,
-    scheme: str,
-    username: Optional[str] = None,
-    password: Optional[SecretStr] = None,
-    database: Optional[str] = None,
-    options: Optional[dict] = None,
-) -> str:
-    """
-    Helper function to prepare the db URL
-    """
-    url = f"{scheme}://"
-    if username is not None:
-        url += f"{username}"
-        if password is not None:
-            url += f":{quote_plus(password.get_secret_value())}"
-        url += "@"
-    url += f"{host_port}"
-    if database:
-        url += f"/{database}"
-
-    if options is not None:
-        if database is None:
-            url += "/"
-        params = "&".join(
-            f"{key}={quote_plus(value)}" for (key, value) in options.items() if value
-        )
-        url = f"{url}?{params}"
-    return url
-
-
-class SQLConnectionConfig(DatabaseServiceMetadataPipeline):
-    """
-    Config class containing all supported
-    configurations for an SQL source, including
-    data profiling and DBT generated information.
-    """
-
-    service_name: str
-    db_schema: Optional[str] = None
-    options: dict = {}
-    connect_args: dict = {}
-    include_tables: Optional[bool] = True
-
-    @abstractmethod
-    def get_connection_url(self):
-        return build_sql_source_connection_url(
-            host_port=self.hostPort,
-            scheme=self.scheme,
-            username=self.username,
-            password=self.password,
-            database=self.database,
-            options=self.options,
-        )
-
-    def get_service_type(self) -> DatabaseServiceType:
-        return DatabaseServiceType[self.type]
-
-    def get_service_name(self) -> str:
-        return self.service_name
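
The responsibilities of this deleted module are split elsewhere in the commit: `SQLSourceStatus` moves into `sql_source.py` (above), and URL building is delegated to `metadata.utils.source_connections.get_connection_url`, which `engines.py` imports below. A hedged sketch of the replacement call path, with `service_connection_config` standing in for `workflow_source.serviceConnection.__root__.config`:

    from sqlalchemy import create_engine
    from metadata.utils.source_connections import get_connection_url

    # Build the SQLAlchemy URL from the typed connection config instead of raw host/user fields
    url = get_connection_url(service_connection_config)
    engine = create_engine(url, connect_args={}, echo=False)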

View File

@@ -26,6 +26,9 @@ from metadata.config.common import WorkflowExecutionError
 from metadata.config.workflow import get_ingestion_source, get_processor, get_sink
 from metadata.generated.schema.entity.data.database import Database
 from metadata.generated.schema.entity.data.table import Table
+from metadata.generated.schema.metadataIngestion.databaseServiceMetadataPipeline import (
+    DatabaseServiceMetadataPipeline,
+)
 from metadata.generated.schema.metadataIngestion.workflow import (
     OpenMetadataServerConfig,
     OpenMetadataWorkflowConfig,
@@ -34,14 +37,11 @@ from metadata.ingestion.api.processor import Processor
 from metadata.ingestion.api.sink import Sink
 from metadata.ingestion.api.source import Source
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
-from metadata.ingestion.source.sql_source import SQLSource
-from metadata.ingestion.source.sql_source_common import (
-    SQLConnectionConfig,
-    SQLSourceStatus,
-)
+from metadata.ingestion.source.sql_source import SQLSource, SQLSourceStatus
 from metadata.orm_profiler.api.models import ProfilerProcessorConfig, ProfilerResponse
 from metadata.orm_profiler.utils import logger
-from metadata.utils.engines import create_and_bind_session, get_engine
+from metadata.utils.engines import create_and_bind_session
+from metadata.utils.filters import filter_by_schema, filter_by_table

 logger = logger()
@@ -64,7 +64,7 @@ class ProfilerWorkflow:
         )

         # We will use the existing sources to build the Engine
-        self.source = get_ingestion_source(
+        self.source: Source = get_ingestion_source(
             source_type=self.config.source.type,
             source_config=self.config.source,
             metadata_config=self.metadata_config,

@@ -76,7 +76,9 @@
         )

         # Init and type the source config
-        self.source_config: SQLConnectionConfig = self.source.config
+        self.source_config: DatabaseServiceMetadataPipeline = (
+            self.config.source.sourceConfig.config
+        )
         self.source_status = SQLSourceStatus()

         self.processor = get_processor(

@@ -85,7 +87,7 @@
             metadata_config=self.metadata_config,
             _from="orm_profiler",
             # Pass the session as kwargs for the profiler
-            session=create_and_bind_session(get_engine(self.source_config)),
+            session=create_and_bind_session(self.source.engine),
         )

         if self.config.sink:
@@ -121,20 +123,22 @@
         for table in tables:

             # Validate schema
-            if not self.source_config.schema_filter_pattern.included(
-                table.database.name
+            if filter_by_schema(
+                schema_filter_pattern=self.source_config.schemaFilterPattern,
+                schema_name=table.databaseSchema.name,
             ):
                 self.source_status.filter(
-                    table.database.name, "Schema pattern not allowed"
+                    table.databaseSchema.name, "Schema pattern not allowed"
                 )
                 continue

             # Validate database
-            if not self.source_config.table_filter_pattern.included(
-                str(table.name.__root__)
+            if filter_by_table(
+                table_filter_pattern=self.source_config.tableFilterPattern,
+                table_name=str(table.name.__root__),
             ):
                 self.source_status.filter(
-                    table.fullyQualifiedName.__root__, "Table name pattern not allowed"
+                    table.name.__root__, "Table name pattern not allowed"
                 )
                 continue

View File

@@ -27,12 +27,12 @@ from metadata.utils.source_connections import get_connection_url
 logger = logging.getLogger("Utils")


-def get_engine(config: WorkflowSource, verbose: bool = False) -> Engine:
+def get_engine(workflow_source: WorkflowSource, verbose: bool = False) -> Engine:
     """
     Given an SQL configuration, build the SQLAlchemy Engine
     """
-    logger.info(f"Building Engine for {config.serviceName}...")
-    service_connection_config = config.serviceConnection.__root__.config
+    logger.info(f"Building Engine for {workflow_source.serviceName}...")
+    service_connection_config = workflow_source.serviceConnection.__root__.config
     options = service_connection_config.connectionOptions
     if not options:
         options = {}

@@ -40,7 +40,7 @@ def get_engine(config: WorkflowSource, verbose: bool = False) -> Engine:
     if not connect_args:
         connect_args = {}
     engine = create_engine(
-        get_connection_url(config.serviceConnection.__root__.config),
+        get_connection_url(service_connection_config),
         **options,
         connect_args=connect_args,
         echo=verbose,
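
With the renamed parameter, the profiler workflow above no longer builds a second engine from its own config; it reuses `self.source.engine`, which `SQLSource.__init__` creates through this same helper. A hedged wiring sketch, with `workflow_source` standing in for a parsed `WorkflowSource`:

    from metadata.utils.engines import create_and_bind_session, get_engine

    engine = get_engine(workflow_source=workflow_source, verbose=False)
    session = create_and_bind_session(engine)  # handed to the orm-profiler processor as `session`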

View File

@@ -0,0 +1,103 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Helper that implements table and filter pattern logic
+"""
+import re
+from typing import List, Optional
+
+from metadata.generated.schema.type.filterPattern import FilterPatternModel
+
+
+class InvalidPatternException(Exception):
+    """
+    Raised when an invalid pattern is configured in the workflow
+    """
+
+
+def validate_regex(regex_list: List[str]) -> None:
+    """
+    Check that the given include/exclude regexes
+    are well formatted
+    """
+    for regex in regex_list:
+        try:
+            re.compile(regex)
+        except re.error:
+            raise InvalidPatternException(f"Invalid regex {regex}.")
+
+
+def _filter(filter_pattern: Optional[FilterPatternModel], name: str) -> bool:
+    """
+    Return True if the name needs to be filtered, False otherwise
+
+    Include takes precedence over exclude
+
+    :param filter_pattern: Model defining filtering logic
+    :param name: table or schema name
+    :return: True for filtering, False otherwise
+    """
+    if not filter_pattern:
+        # No filter pattern, nothing to filter
+        return False
+
+    if filter_pattern.includes:
+        validate_regex(filter_pattern.includes)
+        return not any(
+            [
+                matched
+                for regex in filter_pattern.includes
+                if (matched := re.match(regex, name))
+            ]
+        )
+
+    if filter_pattern.excludes:
+        validate_regex(filter_pattern.excludes)
+        return any(
+            [
+                matched
+                for regex in filter_pattern.excludes
+                if (matched := re.match(regex, name))
+            ]
+        )
+
+    return False
+
+
+def filter_by_schema(
+    schema_filter_pattern: Optional[FilterPatternModel], schema_name: str
+) -> bool:
+    """
+    Return True if the schema needs to be filtered, False otherwise
+
+    Include takes precedence over exclude
+
+    :param schema_filter_pattern: Model defining schema filtering logic
+    :param schema_name: table schema name
+    :return: True for filtering, False otherwise
+    """
+    return _filter(schema_filter_pattern, schema_name)
+
+
+def filter_by_table(
+    table_filter_pattern: Optional[FilterPatternModel], table_name: str
+) -> bool:
+    """
+    Return True if the table needs to be filtered, False otherwise
+
+    Include takes precedence over exclude
+
+    :param table_filter_pattern: Model defining schema filtering logic
+    :param table_name: table name
+    :return: True for filtering, False otherwise
+    """
+    return _filter(table_filter_pattern, table_name)
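
A short, hedged usage sketch of the new helpers. It assumes `FilterPatternModel` accepts `includes`/`excludes` keyword arguments, as the fields read in `_filter` suggest; the schema and table names are made up:

    from metadata.generated.schema.type.filterPattern import FilterPatternModel
    from metadata.utils.filters import filter_by_schema, filter_by_table

    pattern = FilterPatternModel(includes=["prod_.*"], excludes=["prod_tmp"])

    # Includes take precedence: once `includes` is set, `excludes` is never evaluated.
    filter_by_schema(pattern, "prod_tmp")       # False -> kept (matches an include)
    filter_by_schema(pattern, "staging_sales")  # True  -> filtered out (no include matches)
    filter_by_table(None, "any_table")          # False -> no pattern means nothing is filtered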

View File

@@ -13,7 +13,6 @@ import logging
 import traceback
 from datetime import datetime, timedelta
 from typing import Any, Dict, Iterable
-from urllib.parse import quote_plus

 from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
 from metadata.generated.schema.api.services.createDashboardService import (

View File

@@ -21,11 +21,11 @@ from sqlalchemy.orm import declarative_base
 from metadata.generated.schema.api.tests.createColumnTest import CreateColumnTestRequest
 from metadata.generated.schema.api.tests.createTableTest import CreateTableTestRequest
 from metadata.generated.schema.entity.data.table import Column, DataType, Table
-from metadata.generated.schema.metadataIngestion.workflow import (
-    OpenMetadataServerConfig,
+from metadata.generated.schema.metadataIngestion.databaseServiceMetadataPipeline import (
+    DatabaseServiceMetadataPipeline,
 )
 from metadata.generated.schema.metadataIngestion.workflow import (
-    Source as WorkflowSource,
+    OpenMetadataServerConfig,
 )
 from metadata.generated.schema.tests.column.columnValuesToBeBetween import (
     ColumnValuesToBeBetween,
@@ -46,10 +46,8 @@ config = {
     "source": {
         "type": "sqlite",
         "serviceName": "my_service",
-        "serviceConnection": {
-            "config": {"type": "SQLite", "hostPort": "", "database": ":memory:"}
-        },
-        "sourceConfig": {},
+        "serviceConnection": {"config": {"type": "SQLite"}},
+        "sourceConfig": {"config": {}},
     },
     "processor": {"type": "orm-profiler", "config": {}},
     "sink": {"type": "metadata-rest", "config": {}},
@@ -68,7 +66,7 @@ def test_init_workflow():
     """
     We can initialise the workflow from a config
     """
-    assert isinstance(workflow.source_config, WorkflowSource)
+    assert isinstance(workflow.source_config, DatabaseServiceMetadataPipeline)
     assert isinstance(workflow.metadata_config, OpenMetadataServerConfig)

     assert isinstance(workflow.processor, OrmProfilerProcessor)
@@ -83,29 +81,33 @@ def test_filter_entities():
     """
     service_name = "service"
-    db_reference1 = EntityReference(id=uuid.uuid4(), name="one_db", type="database")
-    db_reference2 = EntityReference(id=uuid.uuid4(), name="another_db", type="database")
+    schema_reference1 = EntityReference(
+        id=uuid.uuid4(), name="one_schema", type="databaseSchema"
+    )
+    schema_reference2 = EntityReference(
+        id=uuid.uuid4(), name="another_schema", type="databaseSchema"
+    )

     all_tables = [
         Table(
             id=uuid.uuid4(),
             name="table1",
-            database=db_reference1,
-            fullyQualifiedName=f"{service_name}.{db_reference1.name}.table1",
+            databaseSchema=schema_reference1,
+            fullyQualifiedName=f"{service_name}.db.{schema_reference1.name}.table1",
             columns=[Column(name="id", dataType=DataType.BIGINT)],
         ),
         Table(
             id=uuid.uuid4(),
             name="table2",
-            database=db_reference1,
-            fullyQualifiedName=f"{service_name}.{db_reference1.name}.table2",
+            databaseSchema=schema_reference1,
+            fullyQualifiedName=f"{service_name}.db.{schema_reference1.name}.table2",
             columns=[Column(name="id", dataType=DataType.BIGINT)],
         ),
         Table(
             id=uuid.uuid4(),
             name="table3",
-            database=db_reference2,
-            fullyQualifiedName=f"{service_name}.{db_reference2.name}.table3",
+            databaseSchema=schema_reference2,
+            fullyQualifiedName=f"{service_name}.db.{schema_reference2.name}.table3",
             columns=[Column(name="id", dataType=DataType.BIGINT)],
         ),
     ]
@@ -115,9 +117,9 @@ def test_filter_entities():

     # We can exclude based on the schema name
     exclude_filter_schema_config = deepcopy(config)
-    exclude_filter_schema_config["source"]["config"]["schema_filter_pattern"] = {
-        "excludes": ["one_db"]
-    }
+    exclude_filter_schema_config["source"]["sourceConfig"]["config"][
+        "schemaFilterPattern"
+    ] = {"excludes": ["one_schema"]}

     exclude_filter_schema_workflow = ProfilerWorkflow.create(
         exclude_filter_schema_config
@@ -126,9 +128,9 @@ def test_filter_entities():

     # We can include based on the schema name
     include_filter_schema_config = deepcopy(config)
-    include_filter_schema_config["source"]["config"]["schema_filter_pattern"] = {
-        "includes": ["another_db"]
-    }
+    include_filter_schema_config["source"]["sourceConfig"]["config"][
+        "schemaFilterPattern"
+    ] = {"includes": ["another_schema"]}

     include_filter_schema_workflow = ProfilerWorkflow.create(
         include_filter_schema_config
@@ -137,18 +139,18 @@ def test_filter_entities():

     # We can exclude based on the table name
     exclude_filter_table_config = deepcopy(config)
-    exclude_filter_table_config["source"]["config"]["table_filter_pattern"] = {
-        "excludes": ["tab*"]
-    }
+    exclude_filter_table_config["source"]["sourceConfig"]["config"][
+        "tableFilterPattern"
+    ] = {"excludes": ["tab*"]}

     exclude_filter_table_workflow = ProfilerWorkflow.create(exclude_filter_table_config)
     assert len(list(exclude_filter_table_workflow.filter_entities(all_tables))) == 0

     # We can include based on the table name
     include_filter_table_config = deepcopy(config)
-    include_filter_table_config["source"]["config"]["table_filter_pattern"] = {
-        "includes": ["table1"]
-    }
+    include_filter_table_config["source"]["sourceConfig"]["config"][
+        "tableFilterPattern"
+    ] = {"includes": ["table1"]}

     include_filter_table_workflow = ProfilerWorkflow.create(include_filter_table_config)
     assert len(list(include_filter_table_workflow.filter_entities(all_tables))) == 1