mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-10-31 02:29:03 +00:00 
			
		
		
		
	* Usage through query log file * Pick fileds if available in file * Created Usage Souce
This commit is contained in:
		
							parent
							
								
									400818d04d
								
							
						
					
					
						commit
						fd521763d4
					
				| @ -7,9 +7,7 @@ | |||||||
|     "databaseUsageConfigType": { |     "databaseUsageConfigType": { | ||||||
|       "description": "Database Source Config Usage Pipeline type", |       "description": "Database Source Config Usage Pipeline type", | ||||||
|       "type": "string", |       "type": "string", | ||||||
|       "enum": [ |       "enum": ["DatabaseUsage"], | ||||||
|         "DatabaseUsage" |  | ||||||
|       ], |  | ||||||
|       "default": "DatabaseUsage" |       "default": "DatabaseUsage" | ||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
| @ -33,6 +31,10 @@ | |||||||
|       "description": "Configuration to set the limit for query logs", |       "description": "Configuration to set the limit for query logs", | ||||||
|       "type": "integer", |       "type": "integer", | ||||||
|       "default": "100" |       "default": "100" | ||||||
|  |     }, | ||||||
|  |     "queryLogFilePath": { | ||||||
|  |       "description": "Configuration to set the file path for query logs", | ||||||
|  |       "type": "string" | ||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   "additionalProperties": false |   "additionalProperties": false | ||||||
|  | |||||||
| @ -13,7 +13,8 @@ | |||||||
|     }, |     }, | ||||||
|     "sourceConfig": { |     "sourceConfig": { | ||||||
|       "config": { |       "config": { | ||||||
|         "queryLogDuration": "1" |         "queryLogDuration": "1", | ||||||
|  |         "queryLogFilePath": "<path to query log file>" | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|  | |||||||
							
								
								
									
										46
									
								
								ingestion/examples/workflows/query_log_usage.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								ingestion/examples/workflows/query_log_usage.json
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,46 @@ | |||||||
|  | { | ||||||
|  |     "source": { | ||||||
|  |         "type": "query-log-usage", | ||||||
|  |         "serviceName": "local_mysql", | ||||||
|  |         "serviceConnection": { | ||||||
|  |             "config": { | ||||||
|  |                 "type": "Mysql", | ||||||
|  |                 "username": "openmetadata_user", | ||||||
|  |                 "password": "openmetadata_password", | ||||||
|  |                 "hostPort": "localhost:3306", | ||||||
|  |                 "connectionOptions": {}, | ||||||
|  |                 "connectionArguments": {} | ||||||
|  |             } | ||||||
|  |         }, | ||||||
|  |         "sourceConfig": { | ||||||
|  |             "config": { | ||||||
|  |                 "queryLogDuration": "1", | ||||||
|  |                 "queryLogFilePath": "<path to query log file>" | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     }, | ||||||
|  |     "processor": { | ||||||
|  |         "type": "query-parser", | ||||||
|  |         "config": { | ||||||
|  |             "filter": "" | ||||||
|  |         } | ||||||
|  |     }, | ||||||
|  |     "stage": { | ||||||
|  |         "type": "table-usage", | ||||||
|  |         "config": { | ||||||
|  |             "filename": "/tmp/query_log_usage" | ||||||
|  |         } | ||||||
|  |     }, | ||||||
|  |     "bulkSink": { | ||||||
|  |         "type": "metadata-usage", | ||||||
|  |         "config": { | ||||||
|  |             "filename": "/tmp/query_log_usage" | ||||||
|  |         } | ||||||
|  |     }, | ||||||
|  |     "workflowConfig": { | ||||||
|  |         "openMetadataServerConfig": { | ||||||
|  |             "hostPort": "http://localhost:8585/api", | ||||||
|  |             "authProvider": "no-auth" | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -13,7 +13,7 @@ Clickhouse usage module | |||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| import ast | import ast | ||||||
| from typing import Any, Dict, Iterable | from typing import Iterable | ||||||
| 
 | 
 | ||||||
| from metadata.generated.schema.entity.services.connections.database.clickhouseConnection import ( | from metadata.generated.schema.entity.services.connections.database.clickhouseConnection import ( | ||||||
|     ClickhouseConnection, |     ClickhouseConnection, | ||||||
| @ -27,14 +27,15 @@ from metadata.ingestion.api.source import InvalidSourceException, Source, Source | |||||||
| # This import verifies that the dependencies are available. | # This import verifies that the dependencies are available. | ||||||
| from metadata.ingestion.models.table_queries import TableQuery | from metadata.ingestion.models.table_queries import TableQuery | ||||||
| from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus | from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus | ||||||
|  | from metadata.ingestion.source.usage_source import UsageSource | ||||||
| from metadata.utils.connections import get_connection, test_connection | from metadata.utils.connections import get_connection, test_connection | ||||||
| from metadata.utils.helpers import get_start_and_end | from metadata.utils.helpers import get_start_and_end | ||||||
| from metadata.utils.sql_queries import CLICKHOUSE_SQL_USAGE_STATEMENT | from metadata.utils.sql_queries import CLICKHOUSE_SQL_USAGE_STATEMENT | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class ClickhouseUsageSource(Source[TableQuery]): | class ClickhouseUsageSource(UsageSource): | ||||||
|     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): |     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): | ||||||
|         super().__init__() |         super().__init__(config, metadata_config) | ||||||
|         self.config = config |         self.config = config | ||||||
|         self.connection = config.serviceConnection.__root__.config |         self.connection = config.serviceConnection.__root__.config | ||||||
|         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) |         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) | ||||||
| @ -56,15 +57,6 @@ class ClickhouseUsageSource(Source[TableQuery]): | |||||||
| 
 | 
 | ||||||
|         return cls(config, metadata_config) |         return cls(config, metadata_config) | ||||||
| 
 | 
 | ||||||
|     def prepare(self): |  | ||||||
|         return super().prepare() |  | ||||||
| 
 |  | ||||||
|     def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]: |  | ||||||
| 
 |  | ||||||
|         rows = self.engine.execute(self.sql_stmt) |  | ||||||
|         for row in rows: |  | ||||||
|             yield row |  | ||||||
| 
 |  | ||||||
|     def next_record(self) -> Iterable[TableQuery]: |     def next_record(self) -> Iterable[TableQuery]: | ||||||
|         """ |         """ | ||||||
|         Using itertools.groupby and raw level iterator, |         Using itertools.groupby and raw level iterator, | ||||||
| @ -89,20 +81,3 @@ class ClickhouseUsageSource(Source[TableQuery]): | |||||||
|                 service_name=self.config.serviceName, |                 service_name=self.config.serviceName, | ||||||
|             ) |             ) | ||||||
|             yield table_query |             yield table_query | ||||||
| 
 |  | ||||||
|     def get_report(self): |  | ||||||
|         """ |  | ||||||
|         get report |  | ||||||
| 
 |  | ||||||
|         Returns: |  | ||||||
|         """ |  | ||||||
|         return self.report |  | ||||||
| 
 |  | ||||||
|     def close(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def get_status(self) -> SourceStatus: |  | ||||||
|         return self.report |  | ||||||
| 
 |  | ||||||
|     def test_connection(self) -> None: |  | ||||||
|         test_connection(self.engine) |  | ||||||
|  | |||||||
| @ -11,36 +11,30 @@ | |||||||
| """ | """ | ||||||
| MSSQL usage module | MSSQL usage module | ||||||
| """ | """ | ||||||
| 
 |  | ||||||
| from typing import Any, Dict, Iterable |  | ||||||
| 
 |  | ||||||
| from metadata.generated.schema.entity.services.connections.database.mssqlConnection import ( | from metadata.generated.schema.entity.services.connections.database.mssqlConnection import ( | ||||||
|     MssqlConnection, |     MssqlConnection, | ||||||
| ) | ) | ||||||
|  | from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( | ||||||
|  |     OpenMetadataConnection, | ||||||
|  | ) | ||||||
| from metadata.generated.schema.metadataIngestion.workflow import ( | from metadata.generated.schema.metadataIngestion.workflow import ( | ||||||
|     Source as WorkflowSource, |     Source as WorkflowSource, | ||||||
| ) | ) | ||||||
| from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | ||||||
| from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus | from metadata.ingestion.api.source import InvalidSourceException | ||||||
|  | from metadata.ingestion.source.usage_source import UsageSource | ||||||
| 
 | 
 | ||||||
| # This import verifies that the dependencies are available. | # This import verifies that the dependencies are available. | ||||||
| from metadata.ingestion.models.table_queries import TableQuery |  | ||||||
| from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus |  | ||||||
| from metadata.utils.connections import get_connection, test_connection |  | ||||||
| from metadata.utils.helpers import get_start_and_end | from metadata.utils.helpers import get_start_and_end | ||||||
| from metadata.utils.sql_queries import MSSQL_SQL_USAGE_STATEMENT | from metadata.utils.sql_queries import MSSQL_SQL_USAGE_STATEMENT | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class MssqlUsageSource(Source[TableQuery]): | class MssqlUsageSource(UsageSource): | ||||||
|     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): |     def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): | ||||||
|         super().__init__() |         super().__init__(config, metadata_config) | ||||||
|         self.config = config |         start, end = get_start_and_end(config.sourceConfig.config.queryLogDuration) | ||||||
|         self.connection = config.serviceConnection.__root__.config |  | ||||||
|         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) |  | ||||||
|         self.analysis_date = start |         self.analysis_date = start | ||||||
|         self.sql_stmt = MSSQL_SQL_USAGE_STATEMENT.format(start_date=start, end_date=end) |         self.sql_stmt = MSSQL_SQL_USAGE_STATEMENT.format(start_date=start, end_date=end) | ||||||
|         self.report = SQLSourceStatus() |  | ||||||
|         self.engine = get_connection(self.connection) |  | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create(cls, config_dict, metadata_config: WorkflowConfig): |     def create(cls, config_dict, metadata_config: WorkflowConfig): | ||||||
| @ -52,53 +46,3 @@ class MssqlUsageSource(Source[TableQuery]): | |||||||
|                 f"Expected MssqlConnection, but got {connection}" |                 f"Expected MssqlConnection, but got {connection}" | ||||||
|             ) |             ) | ||||||
|         return cls(config, metadata_config) |         return cls(config, metadata_config) | ||||||
| 
 |  | ||||||
|     def prepare(self): |  | ||||||
|         return super().prepare() |  | ||||||
| 
 |  | ||||||
|     def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]: |  | ||||||
| 
 |  | ||||||
|         rows = self.engine.execute(self.sql_stmt) |  | ||||||
|         for row in rows: |  | ||||||
|             yield row |  | ||||||
| 
 |  | ||||||
|     def next_record(self) -> Iterable[TableQuery]: |  | ||||||
|         """ |  | ||||||
|         Using itertools.groupby and raw level iterator, |  | ||||||
|         it groups to table and yields TableMetadata |  | ||||||
|         :return: |  | ||||||
|         """ |  | ||||||
|         for row in self._get_raw_extract_iter(): |  | ||||||
|             table_query = TableQuery( |  | ||||||
|                 query=row["query_type"], |  | ||||||
|                 user_name=row["user_name"], |  | ||||||
|                 starttime=str(row["start_time"]), |  | ||||||
|                 endtime=str(row["end_time"]), |  | ||||||
|                 analysis_date=self.analysis_date, |  | ||||||
|                 aborted=row["aborted"], |  | ||||||
|                 database=row["database_name"], |  | ||||||
|                 sql=row["query_text"], |  | ||||||
|                 service_name=self.config.serviceName, |  | ||||||
|             ) |  | ||||||
|             if row["schema_name"] is not None: |  | ||||||
|                 self.report.scanned(f"{row['database_name']}.{row['schema_name']}") |  | ||||||
|             else: |  | ||||||
|                 self.report.scanned(f"{row['database_name']}") |  | ||||||
|             yield table_query |  | ||||||
| 
 |  | ||||||
|     def get_report(self): |  | ||||||
|         """ |  | ||||||
|         get report |  | ||||||
| 
 |  | ||||||
|         Returns: |  | ||||||
|         """ |  | ||||||
|         return self.report |  | ||||||
| 
 |  | ||||||
|     def close(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def get_status(self) -> SourceStatus: |  | ||||||
|         return self.report |  | ||||||
| 
 |  | ||||||
|     def test_connection(self) -> None: |  | ||||||
|         test_connection(self.engine) |  | ||||||
|  | |||||||
							
								
								
									
										35
									
								
								ingestion/src/metadata/ingestion/source/query_log_usage.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								ingestion/src/metadata/ingestion/source/query_log_usage.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,35 @@ | |||||||
|  | #  Copyright 2021 Collate | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | #  http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | """ | ||||||
|  | Common Query Log Connector | ||||||
|  | """ | ||||||
|  | from datetime import datetime | ||||||
|  | 
 | ||||||
|  | from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( | ||||||
|  |     OpenMetadataConnection, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.metadataIngestion.workflow import ( | ||||||
|  |     Source as WorkflowSource, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | ||||||
|  | from metadata.ingestion.source.usage_source import UsageSource | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class QueryLogUsageSource(UsageSource): | ||||||
|  |     def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): | ||||||
|  |         super().__init__(config, metadata_config) | ||||||
|  |         self.analysis_date = datetime.today().strftime("%Y-%m-%d %H:%M:%S") | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def create(cls, config_dict, metadata_config: WorkflowConfig): | ||||||
|  |         """Create class instance""" | ||||||
|  |         config: WorkflowSource = WorkflowSource.parse_obj(config_dict) | ||||||
|  |         return cls(config, metadata_config) | ||||||
| @ -13,7 +13,7 @@ Redshift usage module | |||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| # This import verifies that the dependencies are available. | # This import verifies that the dependencies are available. | ||||||
| from typing import Any, Dict, Iterable, Iterator, Union | from typing import Iterator, Union | ||||||
| 
 | 
 | ||||||
| from metadata.generated.schema.entity.services.connections.database.redshiftConnection import ( | from metadata.generated.schema.entity.services.connections.database.redshiftConnection import ( | ||||||
|     RedshiftConnection, |     RedshiftConnection, | ||||||
| @ -25,11 +25,8 @@ from metadata.generated.schema.metadataIngestion.workflow import ( | |||||||
|     Source as WorkflowSource, |     Source as WorkflowSource, | ||||||
| ) | ) | ||||||
| from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | ||||||
| from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus | from metadata.ingestion.api.source import InvalidSourceException | ||||||
| from metadata.ingestion.models.table_queries import TableQuery | from metadata.ingestion.source.usage_source import UsageSource | ||||||
| from metadata.ingestion.ometa.ometa_api import OpenMetadata |  | ||||||
| from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus |  | ||||||
| from metadata.utils.connections import get_connection, test_connection |  | ||||||
| from metadata.utils.helpers import get_start_and_end | from metadata.utils.helpers import get_start_and_end | ||||||
| 
 | 
 | ||||||
| # pylint: disable=useless-super-delegation | # pylint: disable=useless-super-delegation | ||||||
| @ -39,7 +36,7 @@ from metadata.utils.sql_queries import REDSHIFT_SQL_STATEMENT | |||||||
| logger = ingestion_logger() | logger = ingestion_logger() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class RedshiftUsageSource(Source[TableQuery]): | class RedshiftUsageSource(UsageSource): | ||||||
|     # SELECT statement from mysql information_schema to extract table and column metadata |     # SELECT statement from mysql information_schema to extract table and column metadata | ||||||
|     SQL_STATEMENT = REDSHIFT_SQL_STATEMENT |     SQL_STATEMENT = REDSHIFT_SQL_STATEMENT | ||||||
|     # CONFIG KEYS |     # CONFIG KEYS | ||||||
| @ -52,11 +49,7 @@ class RedshiftUsageSource(Source[TableQuery]): | |||||||
|     DEFAULT_CLUSTER_SOURCE = "CURRENT_DATABASE()" |     DEFAULT_CLUSTER_SOURCE = "CURRENT_DATABASE()" | ||||||
| 
 | 
 | ||||||
|     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): |     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): | ||||||
|         super().__init__() |         super().__init__(config, metadata_config) | ||||||
|         self.config = config |  | ||||||
|         self.service_connection = config.serviceConnection.__root__.config |  | ||||||
|         self.metadata_config = metadata_config |  | ||||||
|         self.metadata = OpenMetadata(metadata_config) |  | ||||||
|         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) |         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) | ||||||
|         self.sql_stmt = RedshiftUsageSource.SQL_STATEMENT.format( |         self.sql_stmt = RedshiftUsageSource.SQL_STATEMENT.format( | ||||||
|             start_time=start, end_time=end |             start_time=start, end_time=end | ||||||
| @ -64,8 +57,6 @@ class RedshiftUsageSource(Source[TableQuery]): | |||||||
|         self.analysis_date = start |         self.analysis_date = start | ||||||
|         self._extract_iter: Union[None, Iterator] = None |         self._extract_iter: Union[None, Iterator] = None | ||||||
|         self._database = "redshift" |         self._database = "redshift" | ||||||
|         self.status = SQLSourceStatus() |  | ||||||
|         self.engine = get_connection(self.service_connection) |  | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create(cls, config_dict, metadata_config: WorkflowConfig): |     def create(cls, config_dict, metadata_config: WorkflowConfig): | ||||||
| @ -76,40 +67,3 @@ class RedshiftUsageSource(Source[TableQuery]): | |||||||
|                 f"Expected RedshiftConnection, but got {connection}" |                 f"Expected RedshiftConnection, but got {connection}" | ||||||
|             ) |             ) | ||||||
|         return cls(config, metadata_config) |         return cls(config, metadata_config) | ||||||
| 
 |  | ||||||
|     def prepare(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]: |  | ||||||
| 
 |  | ||||||
|         rows = self.engine.execute(self.sql_stmt) |  | ||||||
|         for row in rows: |  | ||||||
|             yield row |  | ||||||
| 
 |  | ||||||
|     def next_record(self) -> Iterable[TableQuery]: |  | ||||||
|         """ |  | ||||||
|         Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata |  | ||||||
|         :return: |  | ||||||
|         """ |  | ||||||
|         for row in self._get_raw_extract_iter(): |  | ||||||
|             tq = TableQuery( |  | ||||||
|                 query=row["query"], |  | ||||||
|                 user_name=row["usename"], |  | ||||||
|                 starttime=str(row["starttime"]), |  | ||||||
|                 endtime=str(row["endtime"]), |  | ||||||
|                 analysis_date=str(self.analysis_date), |  | ||||||
|                 database=self.service_connection.database, |  | ||||||
|                 aborted=row["aborted"], |  | ||||||
|                 sql=row["querytxt"], |  | ||||||
|                 service_name=self.config.serviceName, |  | ||||||
|             ) |  | ||||||
|             yield tq |  | ||||||
| 
 |  | ||||||
|     def close(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def get_status(self) -> SourceStatus: |  | ||||||
|         return self.status |  | ||||||
| 
 |  | ||||||
|     def test_connection(self) -> None: |  | ||||||
|         test_connection(self.engine) |  | ||||||
|  | |||||||
| @ -14,7 +14,7 @@ Snowflake usage module | |||||||
| 
 | 
 | ||||||
| import traceback | import traceback | ||||||
| from datetime import timedelta | from datetime import timedelta | ||||||
| from typing import Any, Dict, Iterable, Iterator, Union | from typing import Iterable, Iterator, Union | ||||||
| 
 | 
 | ||||||
| from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( | from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( | ||||||
|     SnowflakeConnection, |     SnowflakeConnection, | ||||||
| @ -26,12 +26,11 @@ from metadata.generated.schema.metadataIngestion.workflow import ( | |||||||
|     Source as WorkflowSource, |     Source as WorkflowSource, | ||||||
| ) | ) | ||||||
| from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | ||||||
| from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus | from metadata.ingestion.api.source import InvalidSourceException | ||||||
| 
 | 
 | ||||||
| # This import verifies that the dependencies are available. | # This import verifies that the dependencies are available. | ||||||
| from metadata.ingestion.models.table_queries import TableQuery | from metadata.ingestion.models.table_queries import TableQuery | ||||||
| from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus | from metadata.ingestion.source.usage_source import UsageSource | ||||||
| from metadata.utils.connections import get_connection, test_connection |  | ||||||
| from metadata.utils.helpers import get_start_and_end | from metadata.utils.helpers import get_start_and_end | ||||||
| from metadata.utils.logger import ingestion_logger | from metadata.utils.logger import ingestion_logger | ||||||
| from metadata.utils.sql_queries import SNOWFLAKE_SQL_STATEMENT | from metadata.utils.sql_queries import SNOWFLAKE_SQL_STATEMENT | ||||||
| @ -39,8 +38,7 @@ from metadata.utils.sql_queries import SNOWFLAKE_SQL_STATEMENT | |||||||
| logger = ingestion_logger() | logger = ingestion_logger() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class SnowflakeUsageSource(Source[TableQuery]): | class SnowflakeUsageSource(UsageSource): | ||||||
| 
 |  | ||||||
|     # SELECT statement from mysql information_schema |     # SELECT statement from mysql information_schema | ||||||
|     # to extract table and column metadata |     # to extract table and column metadata | ||||||
|     SQL_STATEMENT = SNOWFLAKE_SQL_STATEMENT |     SQL_STATEMENT = SNOWFLAKE_SQL_STATEMENT | ||||||
| @ -55,13 +53,10 @@ class SnowflakeUsageSource(Source[TableQuery]): | |||||||
|     DEFAULT_CLUSTER_SOURCE = "CURRENT_DATABASE()" |     DEFAULT_CLUSTER_SOURCE = "CURRENT_DATABASE()" | ||||||
| 
 | 
 | ||||||
|     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): |     def __init__(self, config: WorkflowSource, metadata_config: WorkflowConfig): | ||||||
|         super().__init__() |         super().__init__(config, metadata_config) | ||||||
|         self.config = config |  | ||||||
|         self.service_connection = config.serviceConnection.__root__.config |  | ||||||
|         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) |         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) | ||||||
|         end = end + timedelta(days=1) |         end = end + timedelta(days=1) | ||||||
|         self.analysis_date = start |         self.analysis_date = start | ||||||
|         self.metadata_config = metadata_config |  | ||||||
|         self.sql_stmt = SnowflakeUsageSource.SQL_STATEMENT.format( |         self.sql_stmt = SnowflakeUsageSource.SQL_STATEMENT.format( | ||||||
|             start_date=start, |             start_date=start, | ||||||
|             end_date=end, |             end_date=end, | ||||||
| @ -69,8 +64,6 @@ class SnowflakeUsageSource(Source[TableQuery]): | |||||||
|         ) |         ) | ||||||
|         self._extract_iter: Union[None, Iterator] = None |         self._extract_iter: Union[None, Iterator] = None | ||||||
|         self._database = "Snowflake" |         self._database = "Snowflake" | ||||||
|         self.report = SQLSourceStatus() |  | ||||||
|         self.engine = get_connection(self.service_connection) |  | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create(cls, config_dict, metadata_config: WorkflowConfig): |     def create(cls, config_dict, metadata_config: WorkflowConfig): | ||||||
| @ -82,15 +75,6 @@ class SnowflakeUsageSource(Source[TableQuery]): | |||||||
|             ) |             ) | ||||||
|         return cls(config, metadata_config) |         return cls(config, metadata_config) | ||||||
| 
 | 
 | ||||||
|     def prepare(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]: |  | ||||||
| 
 |  | ||||||
|         rows = self.engine.execute(self.sql_stmt) |  | ||||||
|         for row in rows: |  | ||||||
|             yield row |  | ||||||
| 
 |  | ||||||
|     def next_record(self) -> Iterable[TableQuery]: |     def next_record(self) -> Iterable[TableQuery]: | ||||||
|         """ |         """ | ||||||
|         Using itertools.groupby and raw level iterator, |         Using itertools.groupby and raw level iterator, | ||||||
| @ -121,20 +105,3 @@ class SnowflakeUsageSource(Source[TableQuery]): | |||||||
|             except Exception as err: |             except Exception as err: | ||||||
|                 logger.debug(traceback.format_exc()) |                 logger.debug(traceback.format_exc()) | ||||||
|                 logger.debug(repr(err)) |                 logger.debug(repr(err)) | ||||||
| 
 |  | ||||||
|     def get_report(self): |  | ||||||
|         """ |  | ||||||
|         get report |  | ||||||
| 
 |  | ||||||
|         Returns: |  | ||||||
|         """ |  | ||||||
|         return self.report |  | ||||||
| 
 |  | ||||||
|     def test_connection(self) -> None: |  | ||||||
|         test_connection(self.engine) |  | ||||||
| 
 |  | ||||||
|     def close(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     def get_status(self) -> SourceStatus: |  | ||||||
|         return self.report |  | ||||||
|  | |||||||
							
								
								
									
										113
									
								
								ingestion/src/metadata/ingestion/source/usage_source.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										113
									
								
								ingestion/src/metadata/ingestion/source/usage_source.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,113 @@ | |||||||
|  | #  Copyright 2021 Collate | ||||||
|  | #  Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | #  you may not use this file except in compliance with the License. | ||||||
|  | #  You may obtain a copy of the License at | ||||||
|  | #  http://www.apache.org/licenses/LICENSE-2.0 | ||||||
|  | #  Unless required by applicable law or agreed to in writing, software | ||||||
|  | #  distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | #  See the License for the specific language governing permissions and | ||||||
|  | #  limitations under the License. | ||||||
|  | """ | ||||||
|  | Usage Souce Module | ||||||
|  | """ | ||||||
|  | import csv | ||||||
|  | from typing import Any, Dict, Iterable | ||||||
|  | 
 | ||||||
|  | from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( | ||||||
|  |     OpenMetadataConnection, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.metadataIngestion.workflow import ( | ||||||
|  |     Source as WorkflowSource, | ||||||
|  | ) | ||||||
|  | from metadata.generated.schema.metadataIngestion.workflow import WorkflowConfig | ||||||
|  | from metadata.ingestion.api.source import InvalidSourceException, Source, SourceStatus | ||||||
|  | 
 | ||||||
|  | # This import verifies that the dependencies are available. | ||||||
|  | from metadata.ingestion.models.table_queries import TableQuery | ||||||
|  | from metadata.ingestion.source.sql_alchemy_helper import SQLSourceStatus | ||||||
|  | from metadata.utils.connections import get_connection, test_connection | ||||||
|  | from metadata.utils.helpers import get_start_and_end | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class UsageSource(Source[TableQuery]): | ||||||
|  |     def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): | ||||||
|  |         super().__init__() | ||||||
|  |         self.config = config | ||||||
|  |         self.metadata_config = metadata_config | ||||||
|  |         self.connection = config.serviceConnection.__root__.config | ||||||
|  |         start, end = get_start_and_end(self.config.sourceConfig.config.queryLogDuration) | ||||||
|  |         self.analysis_date = start | ||||||
|  |         self.report = SQLSourceStatus() | ||||||
|  |         self.engine = get_connection(self.connection) | ||||||
|  | 
 | ||||||
|  |     def prepare(self): | ||||||
|  |         return super().prepare() | ||||||
|  | 
 | ||||||
|  |     def _get_raw_extract_iter(self) -> Iterable[Dict[str, Any]]: | ||||||
|  |         if self.config.sourceConfig.config.queryLogFilePath: | ||||||
|  |             with open(self.config.sourceConfig.config.queryLogFilePath, "r") as fin: | ||||||
|  |                 for i in csv.DictReader(fin): | ||||||
|  |                     query_dict = dict(i) | ||||||
|  |                     row = { | ||||||
|  |                         "query_type": query_dict.get("query"), | ||||||
|  |                         "user_name": query_dict.get("user_name", ""), | ||||||
|  |                         "start_time": query_dict.get("start_time", ""), | ||||||
|  |                         "end_time": query_dict.get("end_time", ""), | ||||||
|  |                         "aborted": query_dict.get("aborted", False), | ||||||
|  |                         "database_name": query_dict.get( | ||||||
|  |                             "database_name", | ||||||
|  |                             self.connection.database | ||||||
|  |                             if self.connection.database | ||||||
|  |                             else "default", | ||||||
|  |                         ), | ||||||
|  |                         "query_text": query_dict.get("query"), | ||||||
|  |                         "schema_name": query_dict.get("schema_name"), | ||||||
|  |                     } | ||||||
|  |                     yield row | ||||||
|  |         else: | ||||||
|  |             rows = self.engine.execute(self.sql_stmt) | ||||||
|  |             for row in rows: | ||||||
|  |                 yield row | ||||||
|  | 
 | ||||||
|  |     def next_record(self) -> Iterable[TableQuery]: | ||||||
|  |         """ | ||||||
|  |         Using itertools.groupby and raw level iterator, | ||||||
|  |         it groups to table and yields TableMetadata | ||||||
|  |         :return: | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         for row in self._get_raw_extract_iter(): | ||||||
|  |             table_query = TableQuery( | ||||||
|  |                 query=row["query_type"], | ||||||
|  |                 user_name=row["user_name"], | ||||||
|  |                 starttime=str(row["start_time"]), | ||||||
|  |                 endtime=str(row["end_time"]), | ||||||
|  |                 analysis_date=self.analysis_date, | ||||||
|  |                 aborted=row["aborted"], | ||||||
|  |                 database=row["database_name"], | ||||||
|  |                 sql=row["query_text"], | ||||||
|  |                 service_name=self.config.serviceName, | ||||||
|  |             ) | ||||||
|  |             if not row["schema_name"]: | ||||||
|  |                 self.report.scanned(f"{row['database_name']}.{row['schema_name']}") | ||||||
|  |             else: | ||||||
|  |                 self.report.scanned(f"{row['database_name']}") | ||||||
|  |             yield table_query | ||||||
|  | 
 | ||||||
|  |     def get_report(self): | ||||||
|  |         """ | ||||||
|  |         get report | ||||||
|  | 
 | ||||||
|  |         Returns: | ||||||
|  |         """ | ||||||
|  |         return self.report | ||||||
|  | 
 | ||||||
|  |     def close(self): | ||||||
|  |         pass | ||||||
|  | 
 | ||||||
|  |     def get_status(self) -> SourceStatus: | ||||||
|  |         return self.report | ||||||
|  | 
 | ||||||
|  |     def test_connection(self) -> None: | ||||||
|  |         test_connection(self.engine) | ||||||
| @ -2,16 +2,16 @@ import textwrap | |||||||
| 
 | 
 | ||||||
| REDSHIFT_SQL_STATEMENT = """ | REDSHIFT_SQL_STATEMENT = """ | ||||||
|         SELECT DISTINCT ss.userid, |         SELECT DISTINCT ss.userid, | ||||||
|             ss.query, |             ss.query query_type, | ||||||
|             sui.usename, |             sui.usename user_name, | ||||||
|             ss.tbl, |             ss.tbl, | ||||||
|             sq.querytxt, |             sq.querytxt query_text, | ||||||
|             sti.database, |             sti.database database_name, | ||||||
|             sti.schema, |             sti.schema schema_name, | ||||||
|             sti.table, |             sti.table, | ||||||
|             sq.starttime, |             sq.starttime start_time, | ||||||
|             sq.endtime, |             sq.endtime end_time, | ||||||
|             sq.aborted |             sq.aborted aborted | ||||||
|         FROM stl_scan ss |         FROM stl_scan ss | ||||||
|             JOIN svv_table_info sti ON ss.tbl = sti.table_id |             JOIN svv_table_info sti ON ss.tbl = sti.table_id | ||||||
|             JOIN stl_query sq ON ss.query = sq.query |             JOIN stl_query sq ON ss.query = sq.query | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Mayur Singal
						Mayur Singal