mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-09-09 17:12:02 +00:00
* fix: extracted profiler object from workflow and implemented factory to allow service base logic * fix: ran python linting * fix: renamed `base` to `base_profiler_source` * fix: add logic to set correct database for BQ multi project ID connections * fix: ran python linting
This commit is contained in:
parent
b67e8f5fc0
commit
ddbc7fe14d
@ -17,15 +17,13 @@ Workflow definition for the ORM Profiler.
|
|||||||
- How to define metrics & tests
|
- How to define metrics & tests
|
||||||
"""
|
"""
|
||||||
import traceback
|
import traceback
|
||||||
from copy import deepcopy
|
from typing import Iterable, Optional, cast
|
||||||
from typing import Iterable, List, Optional, Union, cast
|
|
||||||
|
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
from sqlalchemy import MetaData
|
|
||||||
|
|
||||||
from metadata.config.common import WorkflowExecutionError
|
from metadata.config.common import WorkflowExecutionError
|
||||||
from metadata.generated.schema.entity.data.database import Database
|
from metadata.generated.schema.entity.data.database import Database
|
||||||
from metadata.generated.schema.entity.data.table import ColumnProfilerConfig, Table
|
from metadata.generated.schema.entity.data.table import Table
|
||||||
from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
|
from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
|
||||||
DatalakeConnection,
|
DatalakeConnection,
|
||||||
)
|
)
|
||||||
@ -53,21 +51,10 @@ from metadata.ingestion.models.custom_types import ServiceWithConnectionType
|
|||||||
from metadata.ingestion.ometa.client_utils import create_ometa_client
|
from metadata.ingestion.ometa.client_utils import create_ometa_client
|
||||||
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||||
from metadata.ingestion.source.connections import get_connection, get_test_connection_fn
|
from metadata.ingestion.source.connections import get_connection, get_test_connection_fn
|
||||||
from metadata.profiler.api.models import (
|
from metadata.profiler.api.models import ProfilerProcessorConfig, ProfilerResponse
|
||||||
ProfilerProcessorConfig,
|
|
||||||
ProfilerResponse,
|
|
||||||
TableConfig,
|
|
||||||
)
|
|
||||||
from metadata.profiler.interface.pandas.pandas_profiler_interface import (
|
|
||||||
PandasProfilerInterface,
|
|
||||||
)
|
|
||||||
from metadata.profiler.interface.profiler_protocol import ProfilerProtocol
|
|
||||||
from metadata.profiler.interface.sqlalchemy.sqa_profiler_interface import (
|
|
||||||
SQAProfilerInterface,
|
|
||||||
)
|
|
||||||
from metadata.profiler.metrics.registry import Metrics
|
|
||||||
from metadata.profiler.processor.core import Profiler
|
from metadata.profiler.processor.core import Profiler
|
||||||
from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics
|
from metadata.profiler.source.base_profiler_source import BaseProfilerSource
|
||||||
|
from metadata.profiler.source.profiler_source_factory import profiler_source_factory
|
||||||
from metadata.timer.repeated_timer import RepeatedTimer
|
from metadata.timer.repeated_timer import RepeatedTimer
|
||||||
from metadata.timer.workflow_reporter import get_ingestion_status_timer
|
from metadata.timer.workflow_reporter import get_ingestion_status_timer
|
||||||
from metadata.utils import fqn
|
from metadata.utils import fqn
|
||||||
@ -112,7 +99,7 @@ class ProfilerWorkflow(WorkflowStatusMixin):
|
|||||||
self.profiler_config = ProfilerProcessorConfig.parse_obj(
|
self.profiler_config = ProfilerProcessorConfig.parse_obj(
|
||||||
self.config.processor.dict().get("config")
|
self.config.processor.dict().get("config")
|
||||||
)
|
)
|
||||||
self.metadata = OpenMetadata(self.metadata_config)
|
self.metadata = create_ometa_client(self.metadata_config)
|
||||||
self._retrieve_service_connection_if_needed()
|
self._retrieve_service_connection_if_needed()
|
||||||
self.test_connection()
|
self.test_connection()
|
||||||
self.set_ingestion_pipeline_status(state=PipelineState.running)
|
self.set_ingestion_pipeline_status(state=PipelineState.running)
|
||||||
@ -168,71 +155,6 @@ class ProfilerWorkflow(WorkflowStatusMixin):
|
|||||||
|
|
||||||
return self._timer
|
return self._timer
|
||||||
|
|
||||||
def get_config_for_entity(self, entity: Table) -> Optional[TableConfig]:
|
|
||||||
"""Get config for a specific entity
|
|
||||||
|
|
||||||
Args:
|
|
||||||
entity: table entity
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not self.profiler_config.tableConfig:
|
|
||||||
return None
|
|
||||||
return next(
|
|
||||||
(
|
|
||||||
table_config
|
|
||||||
for table_config in self.profiler_config.tableConfig
|
|
||||||
if table_config.fullyQualifiedName.__root__
|
|
||||||
== entity.fullyQualifiedName.__root__ # type: ignore
|
|
||||||
),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_include_columns(self, entity) -> Optional[List[ColumnProfilerConfig]]:
|
|
||||||
"""get included columns"""
|
|
||||||
entity_config: Optional[TableConfig] = self.get_config_for_entity(entity)
|
|
||||||
if entity_config and entity_config.columnConfig:
|
|
||||||
return entity_config.columnConfig.includeColumns
|
|
||||||
|
|
||||||
if entity.tableProfilerConfig:
|
|
||||||
return entity.tableProfilerConfig.includeColumns
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_exclude_columns(self, entity) -> Optional[List[str]]:
|
|
||||||
"""get included columns"""
|
|
||||||
entity_config: Optional[TableConfig] = self.get_config_for_entity(entity)
|
|
||||||
if entity_config and entity_config.columnConfig:
|
|
||||||
return entity_config.columnConfig.excludeColumns
|
|
||||||
|
|
||||||
if entity.tableProfilerConfig:
|
|
||||||
return entity.tableProfilerConfig.excludeColumns
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def create_profiler(
|
|
||||||
self, table_entity: Table, profiler_interface: ProfilerProtocol
|
|
||||||
):
|
|
||||||
"""Profile a single entity"""
|
|
||||||
if not self.profiler_config.profiler:
|
|
||||||
self.profiler = DefaultProfiler(
|
|
||||||
profiler_interface=profiler_interface,
|
|
||||||
include_columns=self.get_include_columns(table_entity),
|
|
||||||
exclude_columns=self.get_exclude_columns(table_entity),
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
metrics = (
|
|
||||||
[Metrics.get(name) for name in self.profiler_config.profiler.metrics]
|
|
||||||
if self.profiler_config.profiler.metrics
|
|
||||||
else get_default_metrics(profiler_interface.table)
|
|
||||||
)
|
|
||||||
|
|
||||||
self.profiler = Profiler(
|
|
||||||
*metrics, # type: ignore
|
|
||||||
profiler_interface=profiler_interface,
|
|
||||||
include_columns=self.get_include_columns(table_entity),
|
|
||||||
exclude_columns=self.get_exclude_columns(table_entity),
|
|
||||||
)
|
|
||||||
|
|
||||||
def filter_databases(self, database: Database) -> Optional[Database]:
|
def filter_databases(self, database: Database) -> Optional[Database]:
|
||||||
"""Returns filtered database entities"""
|
"""Returns filtered database entities"""
|
||||||
if filter_by_database(
|
if filter_by_database(
|
||||||
@ -338,51 +260,17 @@ class ProfilerWorkflow(WorkflowStatusMixin):
|
|||||||
|
|
||||||
yield from self.filter_entities(tables)
|
yield from self.filter_entities(tables)
|
||||||
|
|
||||||
def copy_service_config(self, database) -> DatabaseService.__config__:
|
|
||||||
copy_service_connection_config = deepcopy(
|
|
||||||
self.config.source.serviceConnection.__root__.config # type: ignore
|
|
||||||
)
|
|
||||||
if hasattr(
|
|
||||||
self.config.source.serviceConnection.__root__.config, # type: ignore
|
|
||||||
"supportsDatabase",
|
|
||||||
):
|
|
||||||
if hasattr(copy_service_connection_config, "database"):
|
|
||||||
copy_service_connection_config.database = database.name.__root__ # type: ignore
|
|
||||||
if hasattr(copy_service_connection_config, "catalog"):
|
|
||||||
copy_service_connection_config.catalog = database.name.__root__ # type: ignore
|
|
||||||
|
|
||||||
# we know we'll only be working with databaseServices, we cast the type to satisfy type checker
|
|
||||||
copy_service_connection_config = cast(
|
|
||||||
DatabaseService.__config__, copy_service_connection_config
|
|
||||||
)
|
|
||||||
|
|
||||||
return copy_service_connection_config
|
|
||||||
|
|
||||||
def run_profiler(
|
def run_profiler(
|
||||||
self, entity: Table, copied_service_config, sqa_metadata=None
|
self, entity: Table, profiler_source: BaseProfilerSource
|
||||||
) -> Optional[ProfilerResponse]:
|
) -> Optional[ProfilerResponse]:
|
||||||
"""
|
"""
|
||||||
Main logic for the profiler workflow
|
Main logic for the profiler workflow
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
profiler_interface: Union[
|
profiler_runner: Profiler = profiler_source.get_profiler_runner(
|
||||||
SQAProfilerInterface, PandasProfilerInterface
|
entity, self.profiler_config
|
||||||
] = ProfilerProtocol.create(
|
)
|
||||||
(
|
profile: ProfilerResponse = profiler_runner.process(
|
||||||
copied_service_config.__class__.__name__
|
|
||||||
if isinstance(copied_service_config, NON_SQA_DATABASE_CONNECTIONS)
|
|
||||||
else self.config.source.serviceConnection.__root__.__class__.__name__
|
|
||||||
),
|
|
||||||
entity,
|
|
||||||
self.get_config_for_entity(entity),
|
|
||||||
self.source_config,
|
|
||||||
copied_service_config,
|
|
||||||
create_ometa_client(self.metadata_config),
|
|
||||||
sqa_metadata=sqa_metadata,
|
|
||||||
) # type: ignore
|
|
||||||
self.create_profiler(entity, profiler_interface)
|
|
||||||
self.profiler = cast(Profiler, self.profiler) # satisfy type checker
|
|
||||||
profile: ProfilerResponse = self.profiler.process(
|
|
||||||
self.source_config.generateSampleData,
|
self.source_config.generateSampleData,
|
||||||
self.source_config.processPiiSensitive,
|
self.source_config.processPiiSensitive,
|
||||||
)
|
)
|
||||||
@ -394,18 +282,20 @@ class ProfilerWorkflow(WorkflowStatusMixin):
|
|||||||
self.source_status.failed(name, error, traceback.format_exc())
|
self.source_status.failed(name, error, traceback.format_exc())
|
||||||
try:
|
try:
|
||||||
# if we fail to instantiate a profiler_interface, we won't have a profiler_interface variable
|
# if we fail to instantiate a profiler_interface, we won't have a profiler_interface variable
|
||||||
|
# we'll also catch scenarios where we don't have an interface set
|
||||||
self.source_status.fail_all(
|
self.source_status.fail_all(
|
||||||
profiler_interface.processor_status.failures
|
profiler_source.interface.processor_status.failures
|
||||||
)
|
)
|
||||||
self.source_status.records.extend(
|
self.source_status.records.extend(
|
||||||
profiler_interface.processor_status.records
|
profiler_source.interface.processor_status.records
|
||||||
)
|
)
|
||||||
except UnboundLocalError:
|
except (UnboundLocalError, AttributeError):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
self.source_status.fail_all(profiler_interface.processor_status.failures)
|
# at this point we know we have an interface variable since the `try` block above didn't raise
|
||||||
|
self.source_status.fail_all(profiler_source.interface.processor_status.failures) # type: ignore
|
||||||
self.source_status.records.extend(
|
self.source_status.records.extend(
|
||||||
profiler_interface.processor_status.records
|
profiler_source.interface.processor_status.records # type: ignore
|
||||||
)
|
)
|
||||||
return profile
|
return profile
|
||||||
|
|
||||||
@ -419,18 +309,14 @@ class ProfilerWorkflow(WorkflowStatusMixin):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
for database in self.get_database_entities():
|
for database in self.get_database_entities():
|
||||||
copied_service_config = self.copy_service_config(database)
|
profiler_source = profiler_source_factory.create(
|
||||||
sqa_metadata = (
|
self.config.source.type.lower(),
|
||||||
MetaData()
|
self.config,
|
||||||
if not isinstance(
|
database,
|
||||||
copied_service_config, NON_SQA_DATABASE_CONNECTIONS
|
self.metadata,
|
||||||
)
|
)
|
||||||
else None
|
|
||||||
) # we only need this for sqlalchemy based services
|
|
||||||
for entity in self.get_table_entities(database=database):
|
for entity in self.get_table_entities(database=database):
|
||||||
profile = self.run_profiler(
|
profile = self.run_profiler(entity, profiler_source)
|
||||||
entity, copied_service_config, sqa_metadata
|
|
||||||
)
|
|
||||||
if hasattr(self, "sink") and profile:
|
if hasattr(self, "sink") and profile:
|
||||||
self.sink.write_record(profile)
|
self.sink.write_record(profile)
|
||||||
# At the end of the `execute`, update the associated Ingestion Pipeline status as success
|
# At the end of the `execute`, update the associated Ingestion Pipeline status as success
|
||||||
|
0
ingestion/src/metadata/profiler/source/__init__.py
Normal file
0
ingestion/src/metadata/profiler/source/__init__.py
Normal file
228
ingestion/src/metadata/profiler/source/base_profiler_source.py
Normal file
228
ingestion/src/metadata/profiler/source/base_profiler_source.py
Normal file
@ -0,0 +1,228 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Base source for the profiler used to instantiate a profiler runner with
|
||||||
|
its interface
|
||||||
|
"""
|
||||||
|
from copy import deepcopy
|
||||||
|
from typing import List, Optional, Union, cast
|
||||||
|
|
||||||
|
from sqlalchemy import MetaData
|
||||||
|
|
||||||
|
from metadata.generated.schema.entity.data.table import ColumnProfilerConfig, Table
|
||||||
|
from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
|
||||||
|
DatalakeConnection,
|
||||||
|
)
|
||||||
|
from metadata.generated.schema.entity.services.databaseService import (
|
||||||
|
DatabaseConnection,
|
||||||
|
DatabaseService,
|
||||||
|
)
|
||||||
|
from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
|
||||||
|
DatabaseServiceProfilerPipeline,
|
||||||
|
)
|
||||||
|
from metadata.generated.schema.metadataIngestion.workflow import (
|
||||||
|
OpenMetadataWorkflowConfig,
|
||||||
|
)
|
||||||
|
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||||
|
from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig
|
||||||
|
from metadata.profiler.interface.pandas.pandas_profiler_interface import (
|
||||||
|
PandasProfilerInterface,
|
||||||
|
)
|
||||||
|
from metadata.profiler.interface.profiler_protocol import ProfilerProtocol
|
||||||
|
from metadata.profiler.interface.sqlalchemy.sqa_profiler_interface import (
|
||||||
|
SQAProfilerInterface,
|
||||||
|
)
|
||||||
|
from metadata.profiler.metrics.registry import Metrics
|
||||||
|
from metadata.profiler.processor.core import Profiler
|
||||||
|
from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics
|
||||||
|
|
||||||
|
# Connection config classes that are not handled through SQLAlchemy.
# Used with `isinstance` below: for these types no SQLAlchemy `MetaData` is
# created and the connection config's own class name selects the profiler
# interface.
NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseProfilerSource:
    """
    Base class for the profiler source.

    Holds a copy of the service connection config pointed at one database
    and, for each table, builds the profiler interface and the profiler
    runner that sits on top of it.
    """

    def __init__(
        self,
        config: OpenMetadataWorkflowConfig,
        database: DatabaseService,
        ometa_client: OpenMetadata,
    ):
        """Prepare everything that is shared by all tables of `database`.

        Args:
            config: the workflow configuration
            database: the database being profiled; only `name.__root__`
                is read from it
            ometa_client: OpenMetadata client forwarded to the interface
        """
        self.service_conn_config = self._copy_service_config(config, database)
        self.source_config = cast(
            DatabaseServiceProfilerPipeline, config.source.sourceConfig.config
        )  # satisfy type checker
        self.profiler_config = ProfilerProcessorConfig.parse_obj(
            config.processor.dict().get("config")
        )
        self.ometa_client = ometa_client
        # The two helpers below read `self.service_conn_config`, so they
        # must run after it has been set.
        self.profiler_interface_type: str = self._get_profiler_interface_type(config)
        self.sqa_metadata = self._set_sqa_metadata()
        self._interface = None

    @property
    def interface(
        self,
    ) -> Optional[Union[SQAProfilerInterface, PandasProfilerInterface]]:
        """Get the interface (None until one has been created)"""
        return self._interface

    @interface.setter
    def interface(self, interface):
        """Set the interface"""
        self._interface = interface

    def _set_sqa_metadata(self):
        """Return a fresh sqlalchemy `MetaData`, or None for connection
        types listed in `NON_SQA_DATABASE_CONNECTIONS`"""
        if isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS):
            return None
        return MetaData()

    def _get_profiler_interface_type(self, config: OpenMetadataWorkflowConfig) -> str:
        """Get the key used to pick the profiler interface implementation

        Args:
            config: the workflow configuration
        Returns:
            str: the class name of the copied connection config for
                connection types listed in `NON_SQA_DATABASE_CONNECTIONS`,
                otherwise the class name of the service connection root object
        """
        if isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS):
            return self.service_conn_config.__class__.__name__
        return config.source.serviceConnection.__root__.__class__.__name__

    def _get_config_for_table(
        self, entity: Table, profiler_config
    ) -> Optional[TableConfig]:
        """Get config for a specific entity

        Args:
            entity: table entity
            profiler_config: profiler processor config holding `tableConfig`
        Returns:
            the first table config whose fully qualified name equals the
            entity's one, or None when there is no match / no table config
        """
        if not profiler_config.tableConfig:
            return None

        wanted_fqn = entity.fullyQualifiedName.__root__  # type: ignore
        return next(
            (
                table_config
                for table_config in profiler_config.tableConfig
                if table_config.fullyQualifiedName.__root__ == wanted_fqn
            ),
            None,
        )

    def _get_include_columns(
        self, entity, entity_config: Optional[TableConfig]
    ) -> Optional[List[ColumnProfilerConfig]]:
        """get included columns

        The workflow level table config wins over the profiler config
        stored on the table entity itself.
        """
        if entity_config and entity_config.columnConfig:
            return entity_config.columnConfig.includeColumns

        if entity.tableProfilerConfig:
            return entity.tableProfilerConfig.includeColumns

        return None

    def _get_exclude_columns(
        self, entity, entity_config: Optional[TableConfig]
    ) -> Optional[List[str]]:
        """get excluded columns

        The workflow level table config wins over the profiler config
        stored on the table entity itself.
        """
        if entity_config and entity_config.columnConfig:
            return entity_config.columnConfig.excludeColumns

        if entity.tableProfilerConfig:
            return entity.tableProfilerConfig.excludeColumns

        return None

    def _copy_service_config(
        self, config: OpenMetadataWorkflowConfig, database: DatabaseService
    ) -> DatabaseConnection:
        """Make a copy of the service config and update the database name

        Args:
            config: the workflow configuration
            database: a database entity
        Returns:
            a deep copy of the connection config; the workflow config
            itself is left untouched
        """
        config_copy = deepcopy(
            config.source.serviceConnection.__root__.config  # type: ignore
        )

        # Only connection configs exposing `supportsDatabase` are pointed
        # at the database; whichever of the two fields exists is updated.
        if hasattr(config_copy, "supportsDatabase"):
            for field_name in ("database", "catalog"):
                if hasattr(config_copy, field_name):
                    setattr(config_copy, field_name, database.name.__root__)  # type: ignore

        # we know we'll only be working with DatabaseConnection, we cast the type to satisfy type checker
        return cast(DatabaseConnection, config_copy)

    def create_profiler_interface(
        self,
        entity: Table,
        table_config: Optional[TableConfig],
    ) -> Union[SQAProfilerInterface, PandasProfilerInterface]:
        """Create the profiler interface for a table and store it on `interface`

        Args:
            entity: table entity to profile
            table_config: workflow level config for that table, if any
        Returns:
            the newly created interface
        """
        # Drop the interface of the previously profiled table first. If the
        # creation below raises, `interface` must not keep pointing at another
        # table's interface, otherwise callers inspecting it after a failure
        # would read that other table's processor status.
        self.interface = None

        profiler_interface: Union[
            SQAProfilerInterface, PandasProfilerInterface
        ] = ProfilerProtocol.create(
            self.profiler_interface_type,
            entity,
            table_config,
            self.source_config,
            self.service_conn_config,
            self.ometa_client,
            sqa_metadata=self.sqa_metadata,
        )  # type: ignore

        self.interface = profiler_interface
        return self.interface

    def get_profiler_runner(
        self, entity: Table, profiler_config: ProfilerProcessorConfig
    ) -> Profiler:
        """
        Returns the runner for the profiler

        Args:
            entity: table entity to profile
            profiler_config: profiler processor config
        Returns:
            a `DefaultProfiler` when no profiler is defined in the config,
            otherwise a `Profiler` built from the configured metrics (or the
            default metrics when the configured list is empty)
        """
        table_config = self._get_config_for_table(entity, profiler_config)
        profiler_interface = self.create_profiler_interface(entity, table_config)

        # Same column filters whichever runner is built
        include_columns = self._get_include_columns(entity, table_config)
        exclude_columns = self._get_exclude_columns(entity, table_config)

        if not profiler_config.profiler:
            return DefaultProfiler(
                profiler_interface=profiler_interface,
                include_columns=include_columns,
                exclude_columns=exclude_columns,
            )

        metrics = (
            [Metrics.get(name) for name in profiler_config.profiler.metrics]
            if profiler_config.profiler.metrics
            else get_default_metrics(profiler_interface.table)
        )

        return Profiler(
            *metrics,  # type: ignore
            profiler_interface=profiler_interface,
            include_columns=include_columns,
            exclude_columns=exclude_columns,
        )
|
@ -0,0 +1,61 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Bigquery Profiler source
|
||||||
|
"""
|
||||||
|
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import (
|
||||||
|
BigQueryConnection,
|
||||||
|
)
|
||||||
|
from metadata.generated.schema.entity.services.databaseService import DatabaseService
|
||||||
|
from metadata.generated.schema.metadataIngestion.workflow import (
|
||||||
|
OpenMetadataWorkflowConfig,
|
||||||
|
)
|
||||||
|
from metadata.generated.schema.security.credentials.gcsValues import (
|
||||||
|
GcsCredentialsValues,
|
||||||
|
MultipleProjectId,
|
||||||
|
SingleProjectId,
|
||||||
|
)
|
||||||
|
from metadata.profiler.source.base_profiler_source import BaseProfilerSource
|
||||||
|
|
||||||
|
|
||||||
|
class BigQueryProfilerSource(BaseProfilerSource):
    """override the base profiler source to handle BigQuery specific connection configs"""

    def _copy_service_config(
        self, config: OpenMetadataWorkflowConfig, database: DatabaseService
    ) -> BigQueryConnection:
        """Make a copy of the database connection config.

        When the credentials are given as GCS values and carry a
        `MultipleProjectId`, the copy gets a `SingleProjectId` holding the
        name of the database being profiled instead. Any other credentials
        shape is copied unchanged. The workflow config itself is never
        modified, only the deep copy is.

        Args:
            config (OpenMetadataWorkflowConfig): the workflow configuration
            database (DatabaseService): a database entity

        Returns:
            BigQueryConnection
        """
        connection_copy: BigQueryConnection = deepcopy(
            config.source.serviceConnection.__root__.config  # type: ignore
        )
        gcs_config = connection_copy.credentials.gcsConfig

        uses_multiple_projects = isinstance(
            gcs_config, GcsCredentialsValues
        ) and isinstance(gcs_config.projectId, MultipleProjectId)

        if uses_multiple_projects:
            # narrow the connection down to the single project being profiled
            gcs_config.projectId = SingleProjectId(__root__=database.name.__root__)

        return connection_copy
|
@ -0,0 +1,45 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Factory class for creating profiler source objects
|
||||||
|
"""
|
||||||
|
|
||||||
|
from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import (
|
||||||
|
BigqueryType,
|
||||||
|
)
|
||||||
|
from metadata.profiler.source.base_profiler_source import BaseProfilerSource
|
||||||
|
from metadata.profiler.source.bigquery.profiler_source import BigQueryProfilerSource
|
||||||
|
|
||||||
|
|
||||||
|
class ProfilerSourceFactory:
    """Creational factory for profiler source objects

    Keeps a registry mapping a source type string to the profiler source
    class to instantiate for it. Keys are matched exactly as given.
    """

    def __init__(self):
        # "base" is the fallback `create` uses for any unregistered type
        self._source_type = {"base": BaseProfilerSource}

    def register_source(self, source_type: str, source_class):
        """Register a new source type

        Args:
            source_type: registry key; an existing entry is overwritten
            source_class: class (or callable) building the profiler source
        """
        self._source_type[source_type] = source_class

    def create(self, source_type: str, *args, **kwargs) -> BaseProfilerSource:
        """Create source object based on source type

        Args:
            source_type: registry key to look up
            *args: positional arguments forwarded to the source class
            **kwargs: keyword arguments forwarded to the source class
        Returns:
            an instance of the class registered for `source_type`, or of
            the "base" class when nothing is registered under that key
        """
        source_class = self._source_type.get(source_type) or self._source_type["base"]
        return source_class(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# Module level factory instance shared by importers of this module.
profiler_source_factory = ProfilerSourceFactory()

# BigQuery gets a dedicated source class; any type that is not registered
# here is served by the factory's "base" entry.
profiler_source_factory.register_source(
    source_type=BigqueryType.BigQuery.value.lower(),
    source_class=BigQueryProfilerSource,
)
|
@ -30,7 +30,11 @@ from metadata.generated.schema.entity.data.table import (
|
|||||||
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
|
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
|
||||||
OpenMetadataConnection,
|
OpenMetadataConnection,
|
||||||
)
|
)
|
||||||
from metadata.generated.schema.entity.services.databaseService import DatabaseConnection
|
from metadata.generated.schema.entity.services.databaseService import (
|
||||||
|
DatabaseConnection,
|
||||||
|
DatabaseService,
|
||||||
|
DatabaseServiceType,
|
||||||
|
)
|
||||||
from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
|
from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
|
||||||
DatabaseServiceProfilerPipeline,
|
DatabaseServiceProfilerPipeline,
|
||||||
)
|
)
|
||||||
@ -42,6 +46,7 @@ from metadata.profiler.interface.sqlalchemy.sqa_profiler_interface import (
|
|||||||
SQAProfilerInterface,
|
SQAProfilerInterface,
|
||||||
)
|
)
|
||||||
from metadata.profiler.processor.default import DefaultProfiler
|
from metadata.profiler.processor.default import DefaultProfiler
|
||||||
|
from metadata.profiler.source.base_profiler_source import BaseProfilerSource
|
||||||
|
|
||||||
TABLE = Table(
|
TABLE = Table(
|
||||||
id=uuid.uuid4(),
|
id=uuid.uuid4(),
|
||||||
@ -228,20 +233,21 @@ def test_profile_def(mocked_method, mocked_orm): # pylint: disable=unused-argum
|
|||||||
profile_workflow = ProfilerWorkflow.create(profile_config)
|
profile_workflow = ProfilerWorkflow.create(profile_config)
|
||||||
mocked_method.assert_called()
|
mocked_method.assert_called()
|
||||||
|
|
||||||
profiler_interface: SQAProfilerInterface = ProfilerProtocol.create(
|
profiler_source = BaseProfilerSource(
|
||||||
_profiler_type=DatabaseConnection.__name__,
|
profile_workflow.config,
|
||||||
entity=TABLE,
|
DatabaseService(
|
||||||
entity_config=profile_workflow.get_config_for_entity(TABLE),
|
id=uuid.uuid4(),
|
||||||
source_config=profile_workflow.source_config,
|
name="myDataBaseService",
|
||||||
service_connection_config=profile_workflow.config.source.serviceConnection.__root__.config,
|
serviceType=DatabaseServiceType.SQLite,
|
||||||
ometa_client=None,
|
), # type: ignore
|
||||||
sqa_metadata=MetaData(),
|
profile_workflow.metadata,
|
||||||
|
)
|
||||||
|
profiler_runner = profiler_source.get_profiler_runner(
|
||||||
|
TABLE, profile_workflow.profiler_config
|
||||||
)
|
)
|
||||||
|
|
||||||
profile_workflow.create_profiler(TABLE, profiler_interface)
|
# profile_workflow.create_profiler(TABLE, profiler_interface)
|
||||||
profiler_obj_metrics = [
|
profiler_obj_metrics = [metric.name() for metric in profiler_runner.metrics]
|
||||||
metric.name() for metric in profile_workflow.profiler.metrics
|
|
||||||
]
|
|
||||||
|
|
||||||
assert profile_workflow.profiler_config.profiler
|
assert profile_workflow.profiler_config.profiler
|
||||||
assert config_metrics_label == profiler_obj_metrics
|
assert config_metrics_label == profiler_obj_metrics
|
||||||
@ -268,20 +274,21 @@ def test_default_profile_def(
|
|||||||
profile_workflow = ProfilerWorkflow.create(config)
|
profile_workflow = ProfilerWorkflow.create(config)
|
||||||
mocked_method.assert_called()
|
mocked_method.assert_called()
|
||||||
|
|
||||||
profiler_interface: SQAProfilerInterface = ProfilerProtocol.create(
|
profiler_source = BaseProfilerSource(
|
||||||
_profiler_type=DatabaseConnection.__name__,
|
profile_workflow.config,
|
||||||
entity=TABLE,
|
DatabaseService(
|
||||||
entity_config=profile_workflow.get_config_for_entity(TABLE),
|
id=uuid.uuid4(),
|
||||||
source_config=profile_workflow.source_config,
|
name="myDataBaseService",
|
||||||
service_connection_config=profile_workflow.config.source.serviceConnection.__root__.config,
|
serviceType=DatabaseServiceType.SQLite,
|
||||||
ometa_client=None,
|
), # type: ignore
|
||||||
sqa_metadata=MetaData(),
|
profile_workflow.metadata,
|
||||||
|
)
|
||||||
|
profiler_runner = profiler_source.get_profiler_runner(
|
||||||
|
TABLE, profile_workflow.profiler_config
|
||||||
)
|
)
|
||||||
|
|
||||||
profile_workflow.create_profiler(TABLE, profiler_interface)
|
|
||||||
|
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
profile_workflow.profiler,
|
profiler_runner,
|
||||||
DefaultProfiler,
|
DefaultProfiler,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user