Mirror of https://github.com/open-metadata/OpenMetadata.git
Add optimizations
* optimise and add caching for DI: CA
* formatting
* Added cache cleanup
This commit is contained in:
parent 92fd428380 · commit ecdcf517ac
@@ -18,7 +18,7 @@ from __future__ import annotations
 import traceback
 from collections import defaultdict
 from copy import deepcopy
-from typing import Iterable, Optional
+from typing import Dict, Iterable, Optional
 
 from metadata.data_insight.processor.reports.data_processor import DataProcessor
 from metadata.generated.schema.analytics.reportData import ReportData, ReportDataType
@@ -28,9 +28,11 @@ from metadata.generated.schema.analytics.reportDataType.aggregatedCostAnalysisRe
 from metadata.generated.schema.analytics.reportDataType.rawCostAnalysisReportData import (
     RawCostAnalysisReportData,
 )
-from metadata.generated.schema.entity.data.table import Table
+from metadata.generated.schema.type.entityReference import EntityReference
 from metadata.generated.schema.type.lifeCycle import LifeCycle
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
+from metadata.ingestion.ometa.utils import model_str
+from metadata.utils.constants import ENTITY_REFERENCE_TYPE_MAP
 from metadata.utils.logger import data_insight_logger
 from metadata.utils.time_utils import get_end_of_day_timestamp_mill
 
@@ -98,36 +100,32 @@ class RawCostAnalysisReportDataProcessor(DataProcessor):
                 data=value,
             ) # type: ignore
 
-    def refine(self, entity: Table) -> None:
+    def refine(self, entity: Dict) -> None:
         """Aggregate data
         Returns:
             list:
         """
 
-        try:
-            cost_analysis_data = RawCostAnalysisReportData(
-                entity=self.metadata.get_entity_reference(
-                    entity=type(entity), fqn=entity.fullyQualifiedName
-                )
-            )
-            if entity.lifeCycle:
-                cost_analysis_data.lifeCycle = entity.lifeCycle
-
-            table_profile = self.metadata.get_latest_table_profile(
-                fqn=entity.fullyQualifiedName
-            )
-            if table_profile.profile:
-                cost_analysis_data.sizeInByte = table_profile.profile.sizeInByte
-
-            if cost_analysis_data.lifeCycle or cost_analysis_data.sizeInByte:
-                self._refined_data[
-                    entity.fullyQualifiedName.__root__
-                ] = cost_analysis_data
-
-                self.processor_status.scanned(entity.name.__root__)
-        except Exception as err:
-            logger.debug(traceback.format_exc())
-            logger.error(f"Error trying fetch cost analysis data -- {err}")
+        for entity_fqn, cost_analysis_report_data in entity.items():
+            try:
+                cost_analysis_data = RawCostAnalysisReportData(
+                    entity=EntityReference(
+                        id=cost_analysis_report_data.entity.id,
+                        fullyQualifiedName=model_str(
+                            cost_analysis_report_data.entity.fullyQualifiedName
+                        ),
+                        type=ENTITY_REFERENCE_TYPE_MAP[
+                            type(cost_analysis_report_data.entity).__name__
+                        ],
+                    ),
+                    lifeCycle=cost_analysis_report_data.life_cycle,
+                    sizeInByte=cost_analysis_report_data.size,
+                )
+                self._refined_data[entity_fqn] = cost_analysis_data
+                self.processor_status.scanned(entity_fqn)
+            except Exception as err:
+                logger.debug(traceback.format_exc())
+                logger.error(f"Error trying fetch cost analysis data -- {err}")
 
     def get_status(self):
         return self.processor_status
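For orientation, here is a minimal sketch (not part of the commit) of the dict that refine() now receives: one CostAnalysisReportData-like entry per table FQN, pre-fetched and cached by the producer so the processor no longer calls get_latest_table_profile() itself. The FQNs and sample_cache_entry name are illustrative only.

# Hypothetical shape of one producer batch consumed by refine() (sketch only).
from types import SimpleNamespace

sample_cache_entry = {
    "mysql_svc.shop.public.orders": SimpleNamespace(
        entity=SimpleNamespace(
            id="2f7e-example-id", fullyQualifiedName="mysql_svc.shop.public.orders"
        ),
        life_cycle=None,  # Optional[LifeCycle] captured by the producer
        size=4096.0,      # Optional[float], bytes from the latest table profile
    ),
}

# The processor simply iterates the pre-fetched data; no per-table API calls here.
for entity_fqn, report_data in sample_cache_entry.items():
    print(entity_fqn, report_data.size, report_data.life_cycle)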
@@ -142,6 +140,7 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
         super().__init__(metadata)
         self._refined_data = defaultdict(lambda: defaultdict(dict))
         self.post_hook = self._post_hook_fn
+        self.clean_up_cache = True
 
     def yield_refined_data(self) -> Iterable[ReportData]:
         """Yield refined data"""
@@ -152,27 +151,17 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
                 data=data,
             ) # type: ignore
 
-    def refine(self, entity: Table) -> None:
+    def refine(self, entity: Dict) -> None:
         """Aggregate data
         Returns:
             list:
         """
         try:
-            life_cycle = None
-            if entity.lifeCycle:
-                life_cycle = entity.lifeCycle
-
-            size = None
-            table_profile = self.metadata.get_latest_table_profile(
-                fqn=entity.fullyQualifiedName
-            )
-            if table_profile.profile:
-                size = table_profile.profile.sizeInByte
-
-            if life_cycle or size:
-                entity_type = str(entity.__class__.__name__)
-                service_type = str(entity.serviceType.name)
-                service_name = str(entity.service.name)
+            for entity_fqn, cost_analysis_report_data in entity.items():
+                entity_type = str(cost_analysis_report_data.entity.__class__.__name__)
+                service_type = str(cost_analysis_report_data.entity.serviceType.name)
+                service_name = str(cost_analysis_report_data.entity.service.name)
+
                 if not self._refined_data[str(entity_type)][service_type].get(
                     service_name
                 ):
@@ -185,18 +174,18 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
                 else:
                     self._refined_data[entity_type][service_type][service_name][
                         TOTAL_SIZE
-                    ] += (size or 0)
+                    ] += (cost_analysis_report_data.size or 0)
                     self._refined_data[entity_type][service_type][service_name][
                         TOTAL_COUNT
                     ] += 1
 
                 self._get_data_assets_dict(
-                    life_cycle=life_cycle,
-                    size=size,
+                    life_cycle=cost_analysis_report_data.life_cycle,
+                    size=cost_analysis_report_data.size,
                     data=self._refined_data[entity_type][service_type][service_name],
                 )
 
-                self.processor_status.scanned(entity.name.__root__)
+                self.processor_status.scanned(entity_fqn)
         except Exception as err:
             logger.debug(traceback.format_exc())
             logger.error(f"Error trying fetch cost analysis data -- {err}")
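The nested _refined_data structure that this aggregation fills can be pictured with a small standalone sketch (not part of the commit). TOTAL_SIZE and TOTAL_COUNT are treated as plain string keys here purely for illustration; the real module defines its own constants.

# Standalone sketch of the entityType -> serviceType -> serviceName aggregation.
from collections import defaultdict

TOTAL_SIZE, TOTAL_COUNT = "totalSize", "totalCount"  # assumed key names
refined = defaultdict(lambda: defaultdict(dict))

rows = [
    ("Table", "Mysql", "mysql_svc", 1024.0),
    ("Table", "Mysql", "mysql_svc", 2048.0),
]
for entity_type, service_type, service_name, size in rows:
    bucket = refined[entity_type][service_type].setdefault(
        service_name, {TOTAL_SIZE: 0, TOTAL_COUNT: 0}
    )
    bucket[TOTAL_SIZE] += size or 0
    bucket[TOTAL_COUNT] += 1

print(refined["Table"]["Mysql"]["mysql_svc"])  # {'totalSize': 3072.0, 'totalCount': 2}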
@@ -247,7 +236,10 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
         # Iterate over the different time periods and update the data
         for days, key in DAYS:
             days_before_timestamp = get_end_of_day_timestamp_mill(days=days)
-            if life_cycle.accessed.timestamp.__root__ <= days_before_timestamp:
+            if (
+                life_cycle.accessed
+                and life_cycle.accessed.timestamp.__root__ <= days_before_timestamp
+            ):
                 data[UNUSED_DATA_ASSETS][COUNT][key] += 1
                 data[UNUSED_DATA_ASSETS][SIZE][key] += size or 0
             else:
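The extra guard matters because, as the producer change below also assumes, accessed may be unset on a LifeCycle. A tiny sketch with stand-in objects (not part of the commit) shows the difference:

# Sketch: why the added `life_cycle.accessed` check is needed (stand-in objects).
from types import SimpleNamespace

life_cycle = SimpleNamespace(accessed=None)  # lifecycle known, but never accessed
days_before_timestamp = 1_700_000_000_000

# The old form would raise AttributeError on .timestamp when accessed is None:
# life_cycle.accessed.timestamp.__root__ <= days_before_timestamp

# The new form short-circuits safely and counts the asset as used:
is_unused = bool(
    life_cycle.accessed
    and life_cycle.accessed.timestamp.__root__ <= days_before_timestamp
)
print(is_unused)  # False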
@@ -47,6 +47,7 @@ class DataProcessor(abc.ABC):
         self._refined_data = {}
         self.post_hook: Optional[Callable] = None
         self.pre_hook: Optional[Callable] = None
+        self.clean_up_cache: bool = False
 
     @classmethod
     def create(cls, _data_processor_type, metadata: OpenMetadata):
@@ -13,16 +13,31 @@ Producer class for data insight entity reports
 """
 
 import traceback
-from typing import Iterable
+from typing import Dict, Iterable, Optional
 
+from pydantic import BaseModel
+
 from metadata.data_insight.producer.producer_interface import ProducerInterface
 from metadata.generated.schema.entity.data.table import Table
 from metadata.generated.schema.entity.services.databaseService import DatabaseService
+from metadata.generated.schema.type.lifeCycle import LifeCycle
+from metadata.ingestion.api.models import Entity
+from metadata.ingestion.ometa.utils import model_str
 from metadata.utils.logger import data_insight_logger
 
 logger = data_insight_logger()
 
 
+class CostAnalysisReportData(BaseModel):
+    """
+    Query executed get life cycle
+    """
+
+    entity: Entity
+    life_cycle: Optional[LifeCycle]
+    size: Optional[float]
+
+
 class CostAnalysisProducer(ProducerInterface):
     """entity producer class"""
 
@@ -36,27 +51,76 @@ class CostAnalysisProducer(ProducerInterface):
             and database_service.connection.config.supportsProfiler.__root__
         )
 
+    def _check_life_cycle_and_size_data(
+        self, table: Table
+    ) -> Optional[CostAnalysisReportData]:
+        """
+        Method to check if the valid life cycle and table size data is present for the table
+        """
+        cost_analysis_report_data = CostAnalysisReportData(entity=table)
+        if table.lifeCycle and table.lifeCycle.accessed:
+            cost_analysis_report_data.life_cycle = table.lifeCycle
+
+        table_profile = self.metadata.get_latest_table_profile(
+            fqn=table.fullyQualifiedName
+        )
+        if table_profile.profile:
+            cost_analysis_report_data.size = table_profile.profile.sizeInByte
+
+        if cost_analysis_report_data.life_cycle or cost_analysis_report_data.size:
+            return cost_analysis_report_data
+        return None
+
+    def life_cycle_data_dict(
+        self, entities_cache: Optional[Dict], database_service_fqn: str
+    ) -> Iterable[Dict]:
+        """
+        Cache the required lifecycle data to be used by the processors and return the dict
+        """
+        if entities_cache.get(database_service_fqn):
+            yield entities_cache[database_service_fqn]
+        else:
+            tables = self.metadata.list_all_entities(
+                Table,
+                limit=100,
+                skip_on_failure=True,
+                params={"database": database_service_fqn},
+            )
+            entities_cache[database_service_fqn] = {}
+
+            for table in tables:
+                try:
+                    cost_analysis_data = self._check_life_cycle_and_size_data(
+                        table=table
+                    )
+                    if cost_analysis_data:
+                        entities_cache[database_service_fqn][
+                            model_str(table.fullyQualifiedName)
+                        ] = cost_analysis_data
+                except Exception as err:
+                    logger.error(
+                        f"Error trying to fetch cost analysis data for [{model_str(table.fullyQualifiedName)}] -- {err}"
+                    )
+                    logger.debug(traceback.format_exc())
+
+            yield entities_cache[database_service_fqn]
+
     # pylint: disable=dangerous-default-value
-    def fetch_data(self, limit=100, fields=["*"]) -> Iterable:
+    def fetch_data(
+        self, limit=100, fields=["*"], entities_cache=None
+    ) -> Optional[Iterable[Dict]]:
         database_services = self.metadata.list_all_entities(
             DatabaseService, limit=limit, fields=fields, skip_on_failure=True
         )
-        entities_list = []
         for database_service in database_services or []:
             try:
                 if self._check_profiler_and_usage_support(database_service):
-                    entities_list.extend(
-                        self.metadata.list_all_entities(
-                            Table,
-                            limit=limit,
-                            fields=fields,
-                            skip_on_failure=True,
-                            params={
-                                "database": database_service.fullyQualifiedName.__root__
-                            },
-                        )
+                    yield from self.life_cycle_data_dict(
+                        entities_cache=entities_cache,
+                        database_service_fqn=model_str(
+                            database_service.fullyQualifiedName
+                        ),
                     )
             except Exception as err:
                 logger.error(f"Error trying to fetch entities -- {err}")
                 logger.debug(traceback.format_exc())
-        return entities_list
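To see the optimisation, here is a self-contained sketch (not part of the commit, with a fake listing source and hypothetical FQNs) of the cache-or-fetch behaviour that life_cycle_data_dict adds: the first run pays the listing cost per database service, while any later run in the same ingestion pass is served from the shared dict.

# Self-contained sketch of the cache-or-fetch pattern (no OpenMetadata client needed).
from typing import Dict, Iterable

FAKE_TABLES = {"mysql_svc": ["mysql_svc.shop.public.orders", "mysql_svc.shop.public.users"]}
CALLS = {"listings": 0}

def life_cycle_data_dict(entities_cache: Dict, database_service_fqn: str) -> Iterable[Dict]:
    if entities_cache.get(database_service_fqn):
        yield entities_cache[database_service_fqn]  # cache hit: no listing traffic
    else:
        CALLS["listings"] += 1                      # cache miss: one listing pass
        entities_cache[database_service_fqn] = {
            fqn: {"size": 1024.0} for fqn in FAKE_TABLES[database_service_fqn]
        }
        yield entities_cache[database_service_fqn]

cache: Dict = {}
for _ in range(2):  # e.g. the raw and aggregated processors sharing one cache
    for batch in life_cycle_data_dict(cache, "mysql_svc"):
        pass

print(CALLS["listings"])  # 1 -- the second pass was served from the cache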
@@ -52,7 +52,7 @@ class EntityProducer(ProducerInterface):
         ]
 
     # pylint: disable=dangerous-default-value
-    def fetch_data(self, limit=100, fields=["*"]) -> Iterable:
+    def fetch_data(self, limit=100, fields=["*"], entities_cache=None) -> Iterable:
         for entity in self.entities:
             try:
                 yield from self.metadata.list_all_entities(
@@ -24,6 +24,6 @@ class ProducerInterface(ABC):
         self.metadata = metadata
 
     @abstractmethod
-    def fetch_data(self, limit, fields):
+    def fetch_data(self, limit, fields, entities_cache=None):
         """fetch data from source"""
         raise NotImplementedError
@@ -75,7 +75,7 @@ class WebAnalyticsProducer(ProducerInterface):
         CACHED_EVENTS.clear()
 
     def fetch_data(
-        self, limit=100, fields=["*"]
+        self, limit=100, fields=["*"], entities_cache=None
     ): # pylint: disable=dangerous-default-value
         """fetch data for web analytics event"""
         events = self._get_events(None, limit, fields)
@@ -74,6 +74,7 @@ class DataInsightSource(Source):
         super().__init__()
         self.metadata = metadata
         self.date = datetime.utcnow().strftime("%Y-%m-%d")
+        self.entities_cache = {}
 
         _processors = self._instantiate_processors()
         self._processors: Dict[
@@ -130,11 +131,19 @@ class DataInsightSource(Source):
                 processor = cast(DataProcessor, processor)
                 processor.pre_hook() if processor.pre_hook else None # pylint: disable=expression-not-assigned
 
-                for data in producer.fetch_data(fields=["owner", "tags"]):
+                for data in (
+                    producer.fetch_data(
+                        fields=["owner", "tags"], entities_cache=self.entities_cache
+                    )
+                    or []
+                ):
                     processor.refine(data)
 
                 processor.post_hook() if processor.post_hook else None # pylint: disable=expression-not-assigned
 
+                if processor.clean_up_cache:
+                    self.entities_cache.clear()
+
                 for data in processor.yield_refined_data():
                     yield Either(left=None, right=DataInsightRecord(data=data))
         except KeyError as key_error:
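Putting the pieces together, here is a condensed sketch (stand-in producer and processor objects, not the actual DataInsightSource code) of the driver loop above: every producer receives the shared cache, each processor refines what it gets, and the cache is dropped only once a processor that sets clean_up_cache has finished.

# Condensed sketch of the per-processor driver loop (stand-in objects only).
class StubProcessor:
    def __init__(self, clean_up_cache: bool) -> None:
        self.clean_up_cache = clean_up_cache
        self.seen = []

    def refine(self, data) -> None:
        self.seen.append(data)

class StubProducer:
    def fetch_data(self, fields=None, entities_cache=None):
        # First call populates the shared cache; later calls reuse it.
        entities_cache.setdefault("svc", {"tbl": {"size": 1.0}})
        yield entities_cache["svc"]

entities_cache = {}
processors = [StubProcessor(False), StubProcessor(True)]  # raw first, aggregated last
for processor in processors:
    for data in (
        StubProducer().fetch_data(fields=["owner", "tags"], entities_cache=entities_cache)
        or []
    ):
        processor.refine(data)
    if processor.clean_up_cache:
        entities_cache.clear()  # the opted-in processor is the last cache consumer

print(len(processors[0].seen), len(processors[1].seen), entities_cache)  # 1 1 {}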