mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-10-31 02:29:03 +00:00 
			
		
		
		
	* fix: extracted profiler object from workflow and implemented factory to allow service base logic * fix: ran python linting * fix: renamed `base` to `base_profiler_source` * fix: add logic to set correct database for BQ multi project ID connections * fix: ran python linting
This commit is contained in:
		
							parent
							
								
									b67e8f5fc0
								
							
						
					
					
						commit
						ddbc7fe14d
					
				| @ -17,15 +17,13 @@ Workflow definition for the ORM Profiler. | |||||||
| - How to define metrics & tests | - How to define metrics & tests | ||||||
| """ | """ | ||||||
| import traceback | import traceback | ||||||
| from copy import deepcopy | from typing import Iterable, Optional, cast | ||||||
| from typing import Iterable, List, Optional, Union, cast |  | ||||||
| 
 | 
 | ||||||
| from pydantic import ValidationError | from pydantic import ValidationError | ||||||
| from sqlalchemy import MetaData |  | ||||||
| 
 | 
 | ||||||
| from metadata.config.common import WorkflowExecutionError | from metadata.config.common import WorkflowExecutionError | ||||||
| from metadata.generated.schema.entity.data.database import Database | from metadata.generated.schema.entity.data.database import Database | ||||||
| from metadata.generated.schema.entity.data.table import ColumnProfilerConfig, Table | from metadata.generated.schema.entity.data.table import Table | ||||||
| from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( | from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( | ||||||
|     DatalakeConnection, |     DatalakeConnection, | ||||||
| ) | ) | ||||||
| @ -53,21 +51,10 @@ from metadata.ingestion.models.custom_types import ServiceWithConnectionType | |||||||
| from metadata.ingestion.ometa.client_utils import create_ometa_client | from metadata.ingestion.ometa.client_utils import create_ometa_client | ||||||
| from metadata.ingestion.ometa.ometa_api import OpenMetadata | from metadata.ingestion.ometa.ometa_api import OpenMetadata | ||||||
| from metadata.ingestion.source.connections import get_connection, get_test_connection_fn | from metadata.ingestion.source.connections import get_connection, get_test_connection_fn | ||||||
| from metadata.profiler.api.models import ( | from metadata.profiler.api.models import ProfilerProcessorConfig, ProfilerResponse | ||||||
|     ProfilerProcessorConfig, |  | ||||||
|     ProfilerResponse, |  | ||||||
|     TableConfig, |  | ||||||
| ) |  | ||||||
| from metadata.profiler.interface.pandas.pandas_profiler_interface import ( |  | ||||||
|     PandasProfilerInterface, |  | ||||||
| ) |  | ||||||
| from metadata.profiler.interface.profiler_protocol import ProfilerProtocol |  | ||||||
| from metadata.profiler.interface.sqlalchemy.sqa_profiler_interface import ( |  | ||||||
|     SQAProfilerInterface, |  | ||||||
| ) |  | ||||||
| from metadata.profiler.metrics.registry import Metrics |  | ||||||
| from metadata.profiler.processor.core import Profiler | from metadata.profiler.processor.core import Profiler | ||||||
| from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics | from metadata.profiler.source.base_profiler_source import BaseProfilerSource | ||||||
|  | from metadata.profiler.source.profiler_source_factory import profiler_source_factory | ||||||
| from metadata.timer.repeated_timer import RepeatedTimer | from metadata.timer.repeated_timer import RepeatedTimer | ||||||
| from metadata.timer.workflow_reporter import get_ingestion_status_timer | from metadata.timer.workflow_reporter import get_ingestion_status_timer | ||||||
| from metadata.utils import fqn | from metadata.utils import fqn | ||||||
| @ -112,7 +99,7 @@ class ProfilerWorkflow(WorkflowStatusMixin): | |||||||
|         self.profiler_config = ProfilerProcessorConfig.parse_obj( |         self.profiler_config = ProfilerProcessorConfig.parse_obj( | ||||||
|             self.config.processor.dict().get("config") |             self.config.processor.dict().get("config") | ||||||
|         ) |         ) | ||||||
|         self.metadata = OpenMetadata(self.metadata_config) |         self.metadata = create_ometa_client(self.metadata_config) | ||||||
|         self._retrieve_service_connection_if_needed() |         self._retrieve_service_connection_if_needed() | ||||||
|         self.test_connection() |         self.test_connection() | ||||||
|         self.set_ingestion_pipeline_status(state=PipelineState.running) |         self.set_ingestion_pipeline_status(state=PipelineState.running) | ||||||
| @ -168,71 +155,6 @@ class ProfilerWorkflow(WorkflowStatusMixin): | |||||||
| 
 | 
 | ||||||
|         return self._timer |         return self._timer | ||||||
| 
 | 
 | ||||||
|     def get_config_for_entity(self, entity: Table) -> Optional[TableConfig]: |  | ||||||
|         """Get config for a specific entity |  | ||||||
| 
 |  | ||||||
|         Args: |  | ||||||
|             entity: table entity |  | ||||||
|         """ |  | ||||||
| 
 |  | ||||||
|         if not self.profiler_config.tableConfig: |  | ||||||
|             return None |  | ||||||
|         return next( |  | ||||||
|             ( |  | ||||||
|                 table_config |  | ||||||
|                 for table_config in self.profiler_config.tableConfig |  | ||||||
|                 if table_config.fullyQualifiedName.__root__ |  | ||||||
|                 == entity.fullyQualifiedName.__root__  # type: ignore |  | ||||||
|             ), |  | ||||||
|             None, |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|     def get_include_columns(self, entity) -> Optional[List[ColumnProfilerConfig]]: |  | ||||||
|         """get included columns""" |  | ||||||
|         entity_config: Optional[TableConfig] = self.get_config_for_entity(entity) |  | ||||||
|         if entity_config and entity_config.columnConfig: |  | ||||||
|             return entity_config.columnConfig.includeColumns |  | ||||||
| 
 |  | ||||||
|         if entity.tableProfilerConfig: |  | ||||||
|             return entity.tableProfilerConfig.includeColumns |  | ||||||
| 
 |  | ||||||
|         return None |  | ||||||
| 
 |  | ||||||
|     def get_exclude_columns(self, entity) -> Optional[List[str]]: |  | ||||||
|         """get included columns""" |  | ||||||
|         entity_config: Optional[TableConfig] = self.get_config_for_entity(entity) |  | ||||||
|         if entity_config and entity_config.columnConfig: |  | ||||||
|             return entity_config.columnConfig.excludeColumns |  | ||||||
| 
 |  | ||||||
|         if entity.tableProfilerConfig: |  | ||||||
|             return entity.tableProfilerConfig.excludeColumns |  | ||||||
| 
 |  | ||||||
|         return None |  | ||||||
| 
 |  | ||||||
|     def create_profiler( |  | ||||||
|         self, table_entity: Table, profiler_interface: ProfilerProtocol |  | ||||||
|     ): |  | ||||||
|         """Profile a single entity""" |  | ||||||
|         if not self.profiler_config.profiler: |  | ||||||
|             self.profiler = DefaultProfiler( |  | ||||||
|                 profiler_interface=profiler_interface, |  | ||||||
|                 include_columns=self.get_include_columns(table_entity), |  | ||||||
|                 exclude_columns=self.get_exclude_columns(table_entity), |  | ||||||
|             ) |  | ||||||
|         else: |  | ||||||
|             metrics = ( |  | ||||||
|                 [Metrics.get(name) for name in self.profiler_config.profiler.metrics] |  | ||||||
|                 if self.profiler_config.profiler.metrics |  | ||||||
|                 else get_default_metrics(profiler_interface.table) |  | ||||||
|             ) |  | ||||||
| 
 |  | ||||||
|             self.profiler = Profiler( |  | ||||||
|                 *metrics,  # type: ignore |  | ||||||
|                 profiler_interface=profiler_interface, |  | ||||||
|                 include_columns=self.get_include_columns(table_entity), |  | ||||||
|                 exclude_columns=self.get_exclude_columns(table_entity), |  | ||||||
|             ) |  | ||||||
| 
 |  | ||||||
|     def filter_databases(self, database: Database) -> Optional[Database]: |     def filter_databases(self, database: Database) -> Optional[Database]: | ||||||
|         """Returns filtered database entities""" |         """Returns filtered database entities""" | ||||||
|         if filter_by_database( |         if filter_by_database( | ||||||
| @ -338,51 +260,17 @@ class ProfilerWorkflow(WorkflowStatusMixin): | |||||||
| 
 | 
 | ||||||
|         yield from self.filter_entities(tables) |         yield from self.filter_entities(tables) | ||||||
| 
 | 
 | ||||||
|     def copy_service_config(self, database) -> DatabaseService.__config__: |  | ||||||
|         copy_service_connection_config = deepcopy( |  | ||||||
|             self.config.source.serviceConnection.__root__.config  # type: ignore |  | ||||||
|         ) |  | ||||||
|         if hasattr( |  | ||||||
|             self.config.source.serviceConnection.__root__.config,  # type: ignore |  | ||||||
|             "supportsDatabase", |  | ||||||
|         ): |  | ||||||
|             if hasattr(copy_service_connection_config, "database"): |  | ||||||
|                 copy_service_connection_config.database = database.name.__root__  # type: ignore |  | ||||||
|             if hasattr(copy_service_connection_config, "catalog"): |  | ||||||
|                 copy_service_connection_config.catalog = database.name.__root__  # type: ignore |  | ||||||
| 
 |  | ||||||
|         # we know we'll only be working with databaseServices, we cast the type to satisfy type checker |  | ||||||
|         copy_service_connection_config = cast( |  | ||||||
|             DatabaseService.__config__, copy_service_connection_config |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|         return copy_service_connection_config |  | ||||||
| 
 |  | ||||||
|     def run_profiler( |     def run_profiler( | ||||||
|         self, entity: Table, copied_service_config, sqa_metadata=None |         self, entity: Table, profiler_source: BaseProfilerSource | ||||||
|     ) -> Optional[ProfilerResponse]: |     ) -> Optional[ProfilerResponse]: | ||||||
|         """ |         """ | ||||||
|         Main logic for the profiler workflow |         Main logic for the profiler workflow | ||||||
|         """ |         """ | ||||||
|         try: |         try: | ||||||
|             profiler_interface: Union[ |             profiler_runner: Profiler = profiler_source.get_profiler_runner( | ||||||
|                 SQAProfilerInterface, PandasProfilerInterface |                 entity, self.profiler_config | ||||||
|             ] = ProfilerProtocol.create( |             ) | ||||||
|                 ( |             profile: ProfilerResponse = profiler_runner.process( | ||||||
|                     copied_service_config.__class__.__name__ |  | ||||||
|                     if isinstance(copied_service_config, NON_SQA_DATABASE_CONNECTIONS) |  | ||||||
|                     else self.config.source.serviceConnection.__root__.__class__.__name__ |  | ||||||
|                 ), |  | ||||||
|                 entity, |  | ||||||
|                 self.get_config_for_entity(entity), |  | ||||||
|                 self.source_config, |  | ||||||
|                 copied_service_config, |  | ||||||
|                 create_ometa_client(self.metadata_config), |  | ||||||
|                 sqa_metadata=sqa_metadata, |  | ||||||
|             )  # type: ignore |  | ||||||
|             self.create_profiler(entity, profiler_interface) |  | ||||||
|             self.profiler = cast(Profiler, self.profiler)  # satisfy type checker |  | ||||||
|             profile: ProfilerResponse = self.profiler.process( |  | ||||||
|                 self.source_config.generateSampleData, |                 self.source_config.generateSampleData, | ||||||
|                 self.source_config.processPiiSensitive, |                 self.source_config.processPiiSensitive, | ||||||
|             ) |             ) | ||||||
| @ -394,18 +282,20 @@ class ProfilerWorkflow(WorkflowStatusMixin): | |||||||
|             self.source_status.failed(name, error, traceback.format_exc()) |             self.source_status.failed(name, error, traceback.format_exc()) | ||||||
|             try: |             try: | ||||||
|                 # if we fail to instantiate a profiler_interface, we won't have a profiler_interface variable |                 # if we fail to instantiate a profiler_interface, we won't have a profiler_interface variable | ||||||
|  |                 # we'll also catch scenarios where we don't have an interface set | ||||||
|                 self.source_status.fail_all( |                 self.source_status.fail_all( | ||||||
|                     profiler_interface.processor_status.failures |                     profiler_source.interface.processor_status.failures | ||||||
|                 ) |                 ) | ||||||
|                 self.source_status.records.extend( |                 self.source_status.records.extend( | ||||||
|                     profiler_interface.processor_status.records |                     profiler_source.interface.processor_status.records | ||||||
|                 ) |                 ) | ||||||
|             except UnboundLocalError: |             except (UnboundLocalError, AttributeError): | ||||||
|                 pass |                 pass | ||||||
|         else: |         else: | ||||||
|             self.source_status.fail_all(profiler_interface.processor_status.failures) |             # at this point we know we have an interface variable since we the `try` block above didn't raise | ||||||
|  |             self.source_status.fail_all(profiler_source.interface.processor_status.failures)  # type: ignore | ||||||
|             self.source_status.records.extend( |             self.source_status.records.extend( | ||||||
|                 profiler_interface.processor_status.records |                 profiler_source.interface.processor_status.records  # type: ignore | ||||||
|             ) |             ) | ||||||
|             return profile |             return profile | ||||||
| 
 | 
 | ||||||
| @ -419,18 +309,14 @@ class ProfilerWorkflow(WorkflowStatusMixin): | |||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             for database in self.get_database_entities(): |             for database in self.get_database_entities(): | ||||||
|                 copied_service_config = self.copy_service_config(database) |                 profiler_source = profiler_source_factory.create( | ||||||
|                 sqa_metadata = ( |                     self.config.source.type.lower(), | ||||||
|                     MetaData() |                     self.config, | ||||||
|                     if not isinstance( |                     database, | ||||||
|                         copied_service_config, NON_SQA_DATABASE_CONNECTIONS |                     self.metadata, | ||||||
|                 ) |                 ) | ||||||
|                     else None |  | ||||||
|                 )  # we only need this for sqlalchemy based services |  | ||||||
|                 for entity in self.get_table_entities(database=database): |                 for entity in self.get_table_entities(database=database): | ||||||
|                     profile = self.run_profiler( |                     profile = self.run_profiler(entity, profiler_source) | ||||||
|                         entity, copied_service_config, sqa_metadata |  | ||||||
|                     ) |  | ||||||
|                     if hasattr(self, "sink") and profile: |                     if hasattr(self, "sink") and profile: | ||||||
|                         self.sink.write_record(profile) |                         self.sink.write_record(profile) | ||||||
|             # At the end of the `execute`, update the associated Ingestion Pipeline status as success |             # At the end of the `execute`, update the associated Ingestion Pipeline status as success | ||||||
|  | |||||||
							
								
								
									
										0
									
								
								ingestion/src/metadata/profiler/source/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								ingestion/src/metadata/profiler/source/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										228
									
								
								ingestion/src/metadata/profiler/source/base_profiler_source.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										228
									
								
								ingestion/src/metadata/profiler/source/base_profiler_source.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,228 @@ | |||||||
|  | #  Copyright 2021 Collate | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | #  http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | Base source for the profiler used to instantiate a profiler runner with | ||||||
|  | its interface | ||||||
|  | """ | ||||||
|  | from copy import deepcopy | ||||||
|  | from typing import List, Optional, Union, cast | ||||||
|  | 
 | ||||||
|  | from sqlalchemy import MetaData | ||||||
|  | 
 | ||||||
|  | from metadata.generated.schema.entity.data.table import ColumnProfilerConfig, Table | ||||||
|  | from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( | ||||||
|  |     DatalakeConnection, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.entity.services.databaseService import ( | ||||||
|  |     DatabaseConnection, | ||||||
|  |     DatabaseService, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( | ||||||
|  |     DatabaseServiceProfilerPipeline, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.metadataIngestion.workflow import ( | ||||||
|  |     OpenMetadataWorkflowConfig, | ||||||
|  | ) | ||||||
|  | from metadata.ingestion.ometa.ometa_api import OpenMetadata | ||||||
|  | from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig | ||||||
|  | from metadata.profiler.interface.pandas.pandas_profiler_interface import ( | ||||||
|  |     PandasProfilerInterface, | ||||||
|  | ) | ||||||
|  | from metadata.profiler.interface.profiler_protocol import ProfilerProtocol | ||||||
|  | from metadata.profiler.interface.sqlalchemy.sqa_profiler_interface import ( | ||||||
|  |     SQAProfilerInterface, | ||||||
|  | ) | ||||||
|  | from metadata.profiler.metrics.registry import Metrics | ||||||
|  | from metadata.profiler.processor.core import Profiler | ||||||
|  | from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics | ||||||
|  | 
 | ||||||
|  | NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class BaseProfilerSource: | ||||||
|  |     """ | ||||||
|  |     Base class for the profiler source | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     def __init__( | ||||||
|  |         self, | ||||||
|  |         config: OpenMetadataWorkflowConfig, | ||||||
|  |         database: DatabaseService, | ||||||
|  |         ometa_client: OpenMetadata, | ||||||
|  |     ): | ||||||
|  |         self.service_conn_config = self._copy_service_config(config, database) | ||||||
|  |         self.source_config = config.source.sourceConfig.config | ||||||
|  |         self.source_config = cast( | ||||||
|  |             DatabaseServiceProfilerPipeline, self.source_config | ||||||
|  |         )  # satisfy type checker | ||||||
|  |         self.profiler_config = ProfilerProcessorConfig.parse_obj( | ||||||
|  |             config.processor.dict().get("config") | ||||||
|  |         ) | ||||||
|  |         self.ometa_client = ometa_client | ||||||
|  |         self.profiler_interface_type: str = self._get_profiler_interface_type(config) | ||||||
|  |         self.sqa_metadata = self._set_sqa_metadata() | ||||||
|  |         self._interface = None | ||||||
|  | 
 | ||||||
|  |     @property | ||||||
|  |     def interface( | ||||||
|  |         self, | ||||||
|  |     ) -> Optional[Union[SQAProfilerInterface, PandasProfilerInterface]]: | ||||||
|  |         """Get the interface""" | ||||||
|  |         return self._interface | ||||||
|  | 
 | ||||||
|  |     @interface.setter | ||||||
|  |     def interface(self, interface): | ||||||
|  |         """Set the interface""" | ||||||
|  |         self._interface = interface | ||||||
|  | 
 | ||||||
|  |     def _set_sqa_metadata(self): | ||||||
|  |         """Set sqlalchemy metadata""" | ||||||
|  |         if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): | ||||||
|  |             return MetaData() | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     def _get_profiler_interface_type(self, config) -> str: | ||||||
|  |         """_summary_ | ||||||
|  | 
 | ||||||
|  |         Args: | ||||||
|  |             config (_type_): profiler config | ||||||
|  |         Returns: | ||||||
|  |             str: | ||||||
|  |         """ | ||||||
|  |         if isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): | ||||||
|  |             return self.service_conn_config.__class__.__name__ | ||||||
|  |         return config.source.serviceConnection.__root__.__class__.__name__ | ||||||
|  | 
 | ||||||
|  |     def _get_config_for_table( | ||||||
|  |         self, entity: Table, profiler_config | ||||||
|  |     ) -> Optional[TableConfig]: | ||||||
|  |         """Get config for a specific entity | ||||||
|  | 
 | ||||||
|  |         Args: | ||||||
|  |             entity: table entity | ||||||
|  |         """ | ||||||
|  |         if not profiler_config.tableConfig: | ||||||
|  |             return None | ||||||
|  |         return next( | ||||||
|  |             ( | ||||||
|  |                 table_config | ||||||
|  |                 for table_config in profiler_config.tableConfig | ||||||
|  |                 if table_config.fullyQualifiedName.__root__ | ||||||
|  |                 == entity.fullyQualifiedName.__root__  # type: ignore | ||||||
|  |             ), | ||||||
|  |             None, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def _get_include_columns( | ||||||
|  |         self, entity, entity_config: Optional[TableConfig] | ||||||
|  |     ) -> Optional[List[ColumnProfilerConfig]]: | ||||||
|  |         """get included columns""" | ||||||
|  |         if entity_config and entity_config.columnConfig: | ||||||
|  |             return entity_config.columnConfig.includeColumns | ||||||
|  | 
 | ||||||
|  |         if entity.tableProfilerConfig: | ||||||
|  |             return entity.tableProfilerConfig.includeColumns | ||||||
|  | 
 | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     def _get_exclude_columns( | ||||||
|  |         self, entity, entity_config: Optional[TableConfig] | ||||||
|  |     ) -> Optional[List[str]]: | ||||||
|  |         """get included columns""" | ||||||
|  |         if entity_config and entity_config.columnConfig: | ||||||
|  |             return entity_config.columnConfig.excludeColumns | ||||||
|  | 
 | ||||||
|  |         if entity.tableProfilerConfig: | ||||||
|  |             return entity.tableProfilerConfig.excludeColumns | ||||||
|  | 
 | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     def _copy_service_config( | ||||||
|  |         self, config: OpenMetadataWorkflowConfig, database: DatabaseService | ||||||
|  |     ) -> DatabaseConnection: | ||||||
|  |         """Make a copy of the service config and update the database name | ||||||
|  | 
 | ||||||
|  |         Args: | ||||||
|  |             database (_type_): a database entity | ||||||
|  | 
 | ||||||
|  |         Returns: | ||||||
|  |             DatabaseService.__config__ | ||||||
|  |         """ | ||||||
|  |         config_copy = deepcopy( | ||||||
|  |             config.source.serviceConnection.__root__.config  # type: ignore | ||||||
|  |         ) | ||||||
|  |         if hasattr( | ||||||
|  |             config_copy,  # type: ignore | ||||||
|  |             "supportsDatabase", | ||||||
|  |         ): | ||||||
|  |             if hasattr(config_copy, "database"): | ||||||
|  |                 config_copy.database = database.name.__root__  # type: ignore | ||||||
|  |             if hasattr(config_copy, "catalog"): | ||||||
|  |                 config_copy.catalog = database.name.__root__  # type: ignore | ||||||
|  | 
 | ||||||
|  |         # we know we'll only be working with DatabaseConnection, we cast the type to satisfy type checker | ||||||
|  |         config_copy = cast(DatabaseConnection, config_copy) | ||||||
|  | 
 | ||||||
|  |         return config_copy | ||||||
|  | 
 | ||||||
|  |     def create_profiler_interface( | ||||||
|  |         self, | ||||||
|  |         entity: Table, | ||||||
|  |         table_config: Optional[TableConfig], | ||||||
|  |     ) -> Union[SQAProfilerInterface, PandasProfilerInterface]: | ||||||
|  |         """Create sqlalchemy profiler interface""" | ||||||
|  |         profiler_interface: Union[ | ||||||
|  |             SQAProfilerInterface, PandasProfilerInterface | ||||||
|  |         ] = ProfilerProtocol.create( | ||||||
|  |             self.profiler_interface_type, | ||||||
|  |             entity, | ||||||
|  |             table_config, | ||||||
|  |             self.source_config, | ||||||
|  |             self.service_conn_config, | ||||||
|  |             self.ometa_client, | ||||||
|  |             sqa_metadata=self.sqa_metadata, | ||||||
|  |         )  # type: ignore | ||||||
|  | 
 | ||||||
|  |         self.interface = profiler_interface | ||||||
|  |         return self.interface | ||||||
|  | 
 | ||||||
|  |     def get_profiler_runner( | ||||||
|  |         self, entity: Table, profiler_config: ProfilerProcessorConfig | ||||||
|  |     ) -> Profiler: | ||||||
|  |         """ | ||||||
|  |         Returns the runner for the profiler | ||||||
|  |         """ | ||||||
|  |         table_config = self._get_config_for_table(entity, profiler_config) | ||||||
|  |         profiler_interface = self.create_profiler_interface( | ||||||
|  |             entity, | ||||||
|  |             table_config, | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         if not profiler_config.profiler: | ||||||
|  |             return DefaultProfiler( | ||||||
|  |                 profiler_interface=profiler_interface, | ||||||
|  |                 include_columns=self._get_include_columns(entity, table_config), | ||||||
|  |                 exclude_columns=self._get_exclude_columns(entity, table_config), | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |         metrics = ( | ||||||
|  |             [Metrics.get(name) for name in profiler_config.profiler.metrics] | ||||||
|  |             if profiler_config.profiler.metrics | ||||||
|  |             else get_default_metrics(profiler_interface.table) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         return Profiler( | ||||||
|  |             *metrics,  # type: ignore | ||||||
|  |             profiler_interface=profiler_interface, | ||||||
|  |             include_columns=self._get_include_columns(entity, table_config), | ||||||
|  |             exclude_columns=self._get_exclude_columns(entity, table_config), | ||||||
|  |         ) | ||||||
| @ -0,0 +1,61 @@ | |||||||
|  | #  Copyright 2021 Collate | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | #  http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | Bigquery Profiler source | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from copy import deepcopy | ||||||
|  | 
 | ||||||
|  | from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( | ||||||
|  |     BigQueryConnection, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.entity.services.databaseService import DatabaseService | ||||||
|  | from metadata.generated.schema.metadataIngestion.workflow import ( | ||||||
|  |     OpenMetadataWorkflowConfig, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.security.credentials.gcsValues import ( | ||||||
|  |     GcsCredentialsValues, | ||||||
|  |     MultipleProjectId, | ||||||
|  |     SingleProjectId, | ||||||
|  | ) | ||||||
|  | from metadata.profiler.source.base_profiler_source import BaseProfilerSource | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class BigQueryProfilerSource(BaseProfilerSource): | ||||||
|  |     """override the base profiler source to handle BigQuery specific connection configs""" | ||||||
|  | 
 | ||||||
|  |     def _copy_service_config( | ||||||
|  |         self, config: OpenMetadataWorkflowConfig, database: DatabaseService | ||||||
|  |     ) -> BigQueryConnection: | ||||||
|  |         """Make a copy of the database connection config. If MultiProjectId is used, replace it | ||||||
|  |         with SingleProjectId with the database name being profiled. We iterate over all non filtered | ||||||
|  |         database in workflow.py `def execute`. | ||||||
|  | 
 | ||||||
|  |         Args: | ||||||
|  |             database (DatabaseService): a database entity | ||||||
|  | 
 | ||||||
|  |         Returns: | ||||||
|  |             DatabaseConnection | ||||||
|  |         """ | ||||||
|  |         config_copy: BigQueryConnection = deepcopy( | ||||||
|  |             config.source.serviceConnection.__root__.config  # type: ignore | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         if isinstance(config_copy.credentials.gcsConfig, GcsCredentialsValues): | ||||||
|  |             if isinstance( | ||||||
|  |                 config_copy.credentials.gcsConfig.projectId, MultipleProjectId | ||||||
|  |             ): | ||||||
|  |                 config_copy.credentials.gcsConfig.projectId = SingleProjectId( | ||||||
|  |                     __root__=database.name.__root__ | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|  |         return config_copy | ||||||
| @ -0,0 +1,45 @@ | |||||||
|  | #  Copyright 2021 Collate | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | #  http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | 
 | ||||||
|  | """ | ||||||
|  | Factory class for creating profiler source objects | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( | ||||||
|  |     BigqueryType, | ||||||
|  | ) | ||||||
|  | from metadata.profiler.source.base_profiler_source import BaseProfilerSource | ||||||
|  | from metadata.profiler.source.bigquery.profiler_source import BigQueryProfilerSource | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class ProfilerSourceFactory: | ||||||
|  |     """Creational factory for profiler source objects""" | ||||||
|  | 
 | ||||||
|  |     def __init__(self): | ||||||
|  |         self._source_type = {"base": BaseProfilerSource} | ||||||
|  | 
 | ||||||
|  |     def register_source(self, source_type: str, source_class): | ||||||
|  |         """Register a new source type""" | ||||||
|  |         self._source_type[source_type] = source_class | ||||||
|  | 
 | ||||||
|  |     def create(self, source_type: str, *args, **kwargs) -> BaseProfilerSource: | ||||||
|  |         """Create source object based on source type""" | ||||||
|  |         source_class = self._source_type.get(source_type) | ||||||
|  |         if not source_class: | ||||||
|  |             source_class = self._source_type["base"] | ||||||
|  |             return source_class(*args, **kwargs) | ||||||
|  |         return source_class(*args, **kwargs) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | profiler_source_factory = ProfilerSourceFactory() | ||||||
|  | profiler_source_factory.register_source( | ||||||
|  |     BigqueryType.BigQuery.value.lower(), BigQueryProfilerSource | ||||||
|  | ) | ||||||
| @ -30,7 +30,11 @@ from metadata.generated.schema.entity.data.table import ( | |||||||
| from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( | from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( | ||||||
|     OpenMetadataConnection, |     OpenMetadataConnection, | ||||||
| ) | ) | ||||||
| from metadata.generated.schema.entity.services.databaseService import DatabaseConnection | from metadata.generated.schema.entity.services.databaseService import ( | ||||||
|  |     DatabaseConnection, | ||||||
|  |     DatabaseService, | ||||||
|  |     DatabaseServiceType, | ||||||
|  | ) | ||||||
| from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( | from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( | ||||||
|     DatabaseServiceProfilerPipeline, |     DatabaseServiceProfilerPipeline, | ||||||
| ) | ) | ||||||
| @ -42,6 +46,7 @@ from metadata.profiler.interface.sqlalchemy.sqa_profiler_interface import ( | |||||||
|     SQAProfilerInterface, |     SQAProfilerInterface, | ||||||
| ) | ) | ||||||
| from metadata.profiler.processor.default import DefaultProfiler | from metadata.profiler.processor.default import DefaultProfiler | ||||||
|  | from metadata.profiler.source.base_profiler_source import BaseProfilerSource | ||||||
| 
 | 
 | ||||||
| TABLE = Table( | TABLE = Table( | ||||||
|     id=uuid.uuid4(), |     id=uuid.uuid4(), | ||||||
| @ -228,20 +233,21 @@ def test_profile_def(mocked_method, mocked_orm):  # pylint: disable=unused-argum | |||||||
|     profile_workflow = ProfilerWorkflow.create(profile_config) |     profile_workflow = ProfilerWorkflow.create(profile_config) | ||||||
|     mocked_method.assert_called() |     mocked_method.assert_called() | ||||||
| 
 | 
 | ||||||
|     profiler_interface: SQAProfilerInterface = ProfilerProtocol.create( |     profiler_source = BaseProfilerSource( | ||||||
|         _profiler_type=DatabaseConnection.__name__, |         profile_workflow.config, | ||||||
|         entity=TABLE, |         DatabaseService( | ||||||
|         entity_config=profile_workflow.get_config_for_entity(TABLE), |             id=uuid.uuid4(), | ||||||
|         source_config=profile_workflow.source_config, |             name="myDataBaseService", | ||||||
|         service_connection_config=profile_workflow.config.source.serviceConnection.__root__.config, |             serviceType=DatabaseServiceType.SQLite, | ||||||
|         ometa_client=None, |         ),  # type: ignore | ||||||
|         sqa_metadata=MetaData(), |         profile_workflow.metadata, | ||||||
|  |     ) | ||||||
|  |     profiler_runner = profiler_source.get_profiler_runner( | ||||||
|  |         TABLE, profile_workflow.profiler_config | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     profile_workflow.create_profiler(TABLE, profiler_interface) |     # profile_workflow.create_profiler(TABLE, profiler_interface) | ||||||
|     profiler_obj_metrics = [ |     profiler_obj_metrics = [metric.name() for metric in profiler_runner.metrics] | ||||||
|         metric.name() for metric in profile_workflow.profiler.metrics |  | ||||||
|     ] |  | ||||||
| 
 | 
 | ||||||
|     assert profile_workflow.profiler_config.profiler |     assert profile_workflow.profiler_config.profiler | ||||||
|     assert config_metrics_label == profiler_obj_metrics |     assert config_metrics_label == profiler_obj_metrics | ||||||
| @ -268,20 +274,21 @@ def test_default_profile_def( | |||||||
|     profile_workflow = ProfilerWorkflow.create(config) |     profile_workflow = ProfilerWorkflow.create(config) | ||||||
|     mocked_method.assert_called() |     mocked_method.assert_called() | ||||||
| 
 | 
 | ||||||
|     profiler_interface: SQAProfilerInterface = ProfilerProtocol.create( |     profiler_source = BaseProfilerSource( | ||||||
|         _profiler_type=DatabaseConnection.__name__, |         profile_workflow.config, | ||||||
|         entity=TABLE, |         DatabaseService( | ||||||
|         entity_config=profile_workflow.get_config_for_entity(TABLE), |             id=uuid.uuid4(), | ||||||
|         source_config=profile_workflow.source_config, |             name="myDataBaseService", | ||||||
|         service_connection_config=profile_workflow.config.source.serviceConnection.__root__.config, |             serviceType=DatabaseServiceType.SQLite, | ||||||
|         ometa_client=None, |         ),  # type: ignore | ||||||
|         sqa_metadata=MetaData(), |         profile_workflow.metadata, | ||||||
|  |     ) | ||||||
|  |     profiler_runner = profiler_source.get_profiler_runner( | ||||||
|  |         TABLE, profile_workflow.profiler_config | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     profile_workflow.create_profiler(TABLE, profiler_interface) |  | ||||||
| 
 |  | ||||||
|     assert isinstance( |     assert isinstance( | ||||||
|         profile_workflow.profiler, |         profiler_runner, | ||||||
|         DefaultProfiler, |         DefaultProfiler, | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Teddy
						Teddy