Fix #13790: Cost analysis Data Insights Optimisations (#15147)

* Add optimizations

* optimise and add caching for DI: CA

* formatting

* Added cache cleanup
Onkar Ravgan 2024-02-15 19:20:49 +05:30 committed by GitHub
parent 92fd428380
commit ecdcf517ac
7 changed files with 130 additions and 64 deletions
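
What the optimisation changes: before this commit, the cost analysis producer listed every table per database service, and each of the two cost analysis processors then re-fetched lifecycle and profile data per table. After it, the producer yields one dict per database service mapping table FQNs to prefetched CostAnalysisReportData, stores that dict in a shared entities_cache, and both processors consume the cached dict; the aggregated processor sets clean_up_cache = True so the source clears the cache after the last consumer. Below is a minimal, self-contained sketch of that flow; StubProducer/StubProcessor and the sample FQNs are hypothetical stand-ins for the OpenMetadata plumbing, and only the control flow mirrors the diff.

    from typing import Dict, Iterable, Optional


    class StubProducer:
        """Stands in for CostAnalysisProducer.fetch_data."""

        def fetch_data(self, entities_cache: Optional[Dict] = None) -> Iterable[Dict]:
            for service_fqn in ("svc_a", "svc_b"):
                if entities_cache.get(service_fqn):
                    yield entities_cache[service_fqn]  # cache hit: skip the lookups
                else:
                    # One expensive pass per service: table listing + profile calls
                    entities_cache[service_fqn] = {
                        f"{service_fqn}.db.schema.t{i}": {"size": 100.0 * i}
                        for i in range(1, 3)
                    }
                    yield entities_cache[service_fqn]


    class StubProcessor:
        clean_up_cache = False  # DataProcessor default; the aggregated one flips it

        def refine(self, entity: Dict) -> None:
            for fqn, data in entity.items():
                print(f"refine {fqn}: {data}")


    raw, aggregated = StubProcessor(), StubProcessor()
    aggregated.clean_up_cache = True  # last consumer of the shared cache

    entities_cache: Dict = {}
    for processor in (raw, aggregated):
        for data in StubProducer().fetch_data(entities_cache=entities_cache) or []:
            processor.refine(data)
        if processor.clean_up_cache:
            entities_cache.clear()  # free memory once every processor has run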

File 1 of 7: cost analysis report data processors (RawCostAnalysisReportDataProcessor, AggregatedCostAnalysisReportDataProcessor)

@@ -18,7 +18,7 @@ from __future__ import annotations
 import traceback
 from collections import defaultdict
 from copy import deepcopy
-from typing import Iterable, Optional
+from typing import Dict, Iterable, Optional

 from metadata.data_insight.processor.reports.data_processor import DataProcessor
 from metadata.generated.schema.analytics.reportData import ReportData, ReportDataType
@@ -28,9 +28,11 @@ from metadata.generated.schema.analytics.reportDataType.aggregatedCostAnalysisRe
 from metadata.generated.schema.analytics.reportDataType.rawCostAnalysisReportData import (
     RawCostAnalysisReportData,
 )
-from metadata.generated.schema.entity.data.table import Table
+from metadata.generated.schema.type.entityReference import EntityReference
 from metadata.generated.schema.type.lifeCycle import LifeCycle
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
+from metadata.ingestion.ometa.utils import model_str
+from metadata.utils.constants import ENTITY_REFERENCE_TYPE_MAP
 from metadata.utils.logger import data_insight_logger
 from metadata.utils.time_utils import get_end_of_day_timestamp_mill
@@ -98,36 +100,32 @@ class RawCostAnalysisReportDataProcessor(DataProcessor):
                 data=value,
             )  # type: ignore

-    def refine(self, entity: Table) -> None:
+    def refine(self, entity: Dict) -> None:
         """Aggregate data
         Returns:
             list:
         """
-        try:
-            cost_analysis_data = RawCostAnalysisReportData(
-                entity=self.metadata.get_entity_reference(
-                    entity=type(entity), fqn=entity.fullyQualifiedName
-                )
-            )
-            if entity.lifeCycle:
-                cost_analysis_data.lifeCycle = entity.lifeCycle
-
-            table_profile = self.metadata.get_latest_table_profile(
-                fqn=entity.fullyQualifiedName
-            )
-            if table_profile.profile:
-                cost_analysis_data.sizeInByte = table_profile.profile.sizeInByte
-            if cost_analysis_data.lifeCycle or cost_analysis_data.sizeInByte:
-                self._refined_data[
-                    entity.fullyQualifiedName.__root__
-                ] = cost_analysis_data
-                self.processor_status.scanned(entity.name.__root__)
-        except Exception as err:
-            logger.debug(traceback.format_exc())
-            logger.error(f"Error trying fetch cost analysis data -- {err}")
+        for entity_fqn, cost_analysis_report_data in entity.items():
+            try:
+                cost_analysis_data = RawCostAnalysisReportData(
+                    entity=EntityReference(
+                        id=cost_analysis_report_data.entity.id,
+                        fullyQualifiedName=model_str(
+                            cost_analysis_report_data.entity.fullyQualifiedName
+                        ),
+                        type=ENTITY_REFERENCE_TYPE_MAP[
+                            type(cost_analysis_report_data.entity).__name__
+                        ],
+                    ),
+                    lifeCycle=cost_analysis_report_data.life_cycle,
+                    sizeInByte=cost_analysis_report_data.size,
+                )
+                self._refined_data[entity_fqn] = cost_analysis_data
+                self.processor_status.scanned(entity_fqn)
+            except Exception as err:
+                logger.debug(traceback.format_exc())
+                logger.error(f"Error trying fetch cost analysis data -- {err}")

     def get_status(self):
         return self.processor_status
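
The refine rewrite above also changes how the entity reference is built: instead of one get_entity_reference call per table, the processor assembles an EntityReference from fields already on the cached entity, mapping the entity's class name to its reference type via ENTITY_REFERENCE_TYPE_MAP (imported from metadata.utils.constants in the hunk above). A hedged, stand-alone sketch of that mapping; the map entries and the Table stand-in shown here are assumptions, not the real constant:

    from dataclasses import dataclass

    # Assumed excerpt: the real map lives in metadata.utils.constants and keys
    # entity class names to entity-reference type strings.
    ENTITY_REFERENCE_TYPE_MAP = {"Table": "table", "Dashboard": "dashboard"}


    @dataclass
    class Table:  # stand-in for the generated OpenMetadata Table model
        id: str
        fullyQualifiedName: str


    def to_entity_reference(entity) -> dict:
        # type(entity).__name__ selects the map key, exactly as in the diff
        return {
            "id": entity.id,
            "fullyQualifiedName": entity.fullyQualifiedName,
            "type": ENTITY_REFERENCE_TYPE_MAP[type(entity).__name__],
        }


    print(to_entity_reference(Table("a1b2", "svc.db.schema.orders")))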
@@ -142,6 +140,7 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
         super().__init__(metadata)
         self._refined_data = defaultdict(lambda: defaultdict(dict))
         self.post_hook = self._post_hook_fn
+        self.clean_up_cache = True

     def yield_refined_data(self) -> Iterable[ReportData]:
         """Yield refined data"""
@@ -152,27 +151,17 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
                 data=data,
             )  # type: ignore

-    def refine(self, entity: Table) -> None:
+    def refine(self, entity: Dict) -> None:
         """Aggregate data
         Returns:
             list:
         """
         try:
-            life_cycle = None
-            if entity.lifeCycle:
-                life_cycle = entity.lifeCycle
-
-            size = None
-            table_profile = self.metadata.get_latest_table_profile(
-                fqn=entity.fullyQualifiedName
-            )
-            if table_profile.profile:
-                size = table_profile.profile.sizeInByte
-            if life_cycle or size:
-                entity_type = str(entity.__class__.__name__)
-                service_type = str(entity.serviceType.name)
-                service_name = str(entity.service.name)
+            for entity_fqn, cost_analysis_report_data in entity.items():
+                entity_type = str(cost_analysis_report_data.entity.__class__.__name__)
+                service_type = str(cost_analysis_report_data.entity.serviceType.name)
+                service_name = str(cost_analysis_report_data.entity.service.name)
                 if not self._refined_data[str(entity_type)][service_type].get(
                     service_name
                 ):
@@ -185,18 +174,18 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
                 else:
                     self._refined_data[entity_type][service_type][service_name][
                         TOTAL_SIZE
-                    ] += (size or 0)
+                    ] += (cost_analysis_report_data.size or 0)
                     self._refined_data[entity_type][service_type][service_name][
                         TOTAL_COUNT
                     ] += 1

                 self._get_data_assets_dict(
-                    life_cycle=life_cycle,
-                    size=size,
+                    life_cycle=cost_analysis_report_data.life_cycle,
+                    size=cost_analysis_report_data.size,
                     data=self._refined_data[entity_type][service_type][service_name],
                 )
-                self.processor_status.scanned(entity.name.__root__)
+                self.processor_status.scanned(entity_fqn)
         except Exception as err:
             logger.debug(traceback.format_exc())
             logger.error(f"Error trying fetch cost analysis data -- {err}")
@@ -247,7 +236,10 @@ class AggregatedCostAnalysisReportDataProcessor(DataProcessor):
         # Iterate over the different time periods and update the data
         for days, key in DAYS:
             days_before_timestamp = get_end_of_day_timestamp_mill(days=days)
-            if life_cycle.accessed.timestamp.__root__ <= days_before_timestamp:
+            if (
+                life_cycle.accessed
+                and life_cycle.accessed.timestamp.__root__ <= days_before_timestamp
+            ):
                 data[UNUSED_DATA_ASSETS][COUNT][key] += 1
                 data[UNUSED_DATA_ASSETS][SIZE][key] += size or 0
             else:
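
The last hunk hardens the unused-assets bucketing: the old code dereferenced life_cycle.accessed.timestamp unconditionally and raised AttributeError whenever no access event was recorded, while the new `life_cycle.accessed and ...` form short-circuits on None. A tiny stand-alone illustration (the LifeCycle class here is a simplified stand-in for the generated model):

    class LifeCycle:
        accessed = None  # no access event recorded for this asset


    life_cycle = LifeCycle()
    days_before_timestamp = 1_700_000_000_000

    # The old form, life_cycle.accessed.timestamp <= days_before_timestamp,
    # would raise AttributeError here; the guard short-circuits instead.
    is_unused = bool(
        life_cycle.accessed
        and life_cycle.accessed.timestamp <= days_before_timestamp
    )
    print(is_unused)  # False rather than a crash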

File 2 of 7: DataProcessor base class

@@ -47,6 +47,7 @@ class DataProcessor(abc.ABC):
         self._refined_data = {}
         self.post_hook: Optional[Callable] = None
         self.pre_hook: Optional[Callable] = None
+        self.clean_up_cache: bool = False

     @classmethod
     def create(cls, _data_processor_type, metadata: OpenMetadata):

File 3 of 7: CostAnalysisProducer (adds the CostAnalysisReportData model)

@@ -13,16 +13,31 @@ Producer class for data insight entity reports
 """
 import traceback
-from typing import Iterable
+from typing import Dict, Iterable, Optional
+
+from pydantic import BaseModel

 from metadata.data_insight.producer.producer_interface import ProducerInterface
 from metadata.generated.schema.entity.data.table import Table
 from metadata.generated.schema.entity.services.databaseService import DatabaseService
+from metadata.generated.schema.type.lifeCycle import LifeCycle
+from metadata.ingestion.api.models import Entity
+from metadata.ingestion.ometa.utils import model_str
 from metadata.utils.logger import data_insight_logger

 logger = data_insight_logger()


+class CostAnalysisReportData(BaseModel):
+    """
+    Query executed get life cycle
+    """
+
+    entity: Entity
+    life_cycle: Optional[LifeCycle]
+    size: Optional[float]
+
+
 class CostAnalysisProducer(ProducerInterface):
     """entity producer class"""
@@ -36,27 +51,76 @@ class CostAnalysisProducer(ProducerInterface):
             and database_service.connection.config.supportsProfiler.__root__
         )

+    def _check_life_cycle_and_size_data(
+        self, table: Table
+    ) -> Optional[CostAnalysisReportData]:
+        """
+        Method to check if the valid life cycle and table size data is present for the table
+        """
+        cost_analysis_report_data = CostAnalysisReportData(entity=table)
+        if table.lifeCycle and table.lifeCycle.accessed:
+            cost_analysis_report_data.life_cycle = table.lifeCycle
+
+        table_profile = self.metadata.get_latest_table_profile(
+            fqn=table.fullyQualifiedName
+        )
+        if table_profile.profile:
+            cost_analysis_report_data.size = table_profile.profile.sizeInByte
+
+        if cost_analysis_report_data.life_cycle or cost_analysis_report_data.size:
+            return cost_analysis_report_data
+        return None
+
+    def life_cycle_data_dict(
+        self, entities_cache: Optional[Dict], database_service_fqn: str
+    ) -> Iterable[Dict]:
+        """
+        Cache the required lifecycle data to be used by the processors and return the dict
+        """
+        if entities_cache.get(database_service_fqn):
+            yield entities_cache[database_service_fqn]
+        else:
+            tables = self.metadata.list_all_entities(
+                Table,
+                limit=100,
+                skip_on_failure=True,
+                params={"database": database_service_fqn},
+            )
+            entities_cache[database_service_fqn] = {}
+            for table in tables:
+                try:
+                    cost_analysis_data = self._check_life_cycle_and_size_data(
+                        table=table
+                    )
+                    if cost_analysis_data:
+                        entities_cache[database_service_fqn][
+                            model_str(table.fullyQualifiedName)
+                        ] = cost_analysis_data
+                except Exception as err:
+                    logger.error(
+                        f"Error trying to fetch cost analysis data for [{model_str(table.fullyQualifiedName)}] -- {err}"
+                    )
+                    logger.debug(traceback.format_exc())
+            yield entities_cache[database_service_fqn]
+
     # pylint: disable=dangerous-default-value
-    def fetch_data(self, limit=100, fields=["*"]) -> Iterable:
+    def fetch_data(
+        self, limit=100, fields=["*"], entities_cache=None
+    ) -> Optional[Iterable[Dict]]:
         database_services = self.metadata.list_all_entities(
             DatabaseService, limit=limit, fields=fields, skip_on_failure=True
         )
-        entities_list = []
         for database_service in database_services or []:
             try:
                 if self._check_profiler_and_usage_support(database_service):
-                    entities_list.extend(
-                        self.metadata.list_all_entities(
-                            Table,
-                            limit=limit,
-                            fields=fields,
-                            skip_on_failure=True,
-                            params={
-                                "database": database_service.fullyQualifiedName.__root__
-                            },
-                        )
-                    )
+                    yield from self.life_cycle_data_dict(
+                        entities_cache=entities_cache,
+                        database_service_fqn=model_str(
+                            database_service.fullyQualifiedName
+                        ),
+                    )
             except Exception as err:
                 logger.error(f"Error trying to fetch entities -- {err}")
                 logger.debug(traceback.format_exc())
-        return entities_list
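
life_cycle_data_dict is effectively per-service memoization: the expensive work (list_all_entities over tables plus one get_latest_table_profile call per table) runs only on the first pass, and the second processor's pass is served from entities_cache. A reduced sketch of just that fast/slow path, with a stub standing in for the lookups:

    from typing import Dict, Iterable

    CALLS = {"lookups": 0}


    def expensive_lookup(service_fqn: str) -> Dict[str, float]:
        CALLS["lookups"] += 1  # stands in for table listing + profile fetches
        return {f"{service_fqn}.db.schema.t{i}": float(i) for i in (1, 2)}


    def life_cycle_data_dict(entities_cache: Dict, service_fqn: str) -> Iterable[Dict]:
        if entities_cache.get(service_fqn):  # fast path: already populated
            yield entities_cache[service_fqn]
        else:
            entities_cache[service_fqn] = expensive_lookup(service_fqn)
            yield entities_cache[service_fqn]


    cache: Dict = {}
    for _ in range(2):  # raw pass, then aggregated pass
        list(life_cycle_data_dict(cache, "svc_a"))
    assert CALLS["lookups"] == 1  # second pass is a cache hit

Two subtleties visible in the diff: `entities_cache.get(...)` treats an empty dict as a miss, so a service with no qualifying tables is re-scanned on every pass; and although entities_cache is typed Optional, the method calls `.get` on it directly, so callers must supply a dict, as DataInsightSource now does.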

File 4 of 7: EntityProducer

@@ -52,7 +52,7 @@ class EntityProducer(ProducerInterface):
         ]

     # pylint: disable=dangerous-default-value
-    def fetch_data(self, limit=100, fields=["*"]) -> Iterable:
+    def fetch_data(self, limit=100, fields=["*"], entities_cache=None) -> Iterable:
         for entity in self.entities:
             try:
                 yield from self.metadata.list_all_entities(

File 5 of 7: ProducerInterface

@@ -24,6 +24,6 @@ class ProducerInterface(ABC):
         self.metadata = metadata

     @abstractmethod
-    def fetch_data(self, limit, fields):
+    def fetch_data(self, limit, fields, entities_cache=None):
         """fetch data from source"""
         raise NotImplementedError

File 6 of 7: WebAnalyticsProducer

@@ -75,7 +75,7 @@ class WebAnalyticsProducer(ProducerInterface):
         CACHED_EVENTS.clear()

     def fetch_data(
-        self, limit=100, fields=["*"]
+        self, limit=100, fields=["*"], entities_cache=None
     ):  # pylint: disable=dangerous-default-value
         """fetch data for web analytics event"""
         events = self._get_events(None, limit, fields)
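
The three producer changes above are pure signature alignment: ProducerInterface.fetch_data now carries entities_cache=None, and EntityProducer and WebAnalyticsProducer accept and ignore it, so DataInsightSource can pass the cache to every producer through a single call site. A minimal sketch of the pattern, with a hypothetical WebAnalyticsLikeProducer standing in for the producers that do not use the cache:

    from abc import ABC, abstractmethod
    from typing import Dict, Iterable, Optional


    class ProducerInterface(ABC):
        @abstractmethod
        def fetch_data(self, limit=100, entities_cache: Optional[Dict] = None):
            """fetch data from source"""
            raise NotImplementedError


    class WebAnalyticsLikeProducer(ProducerInterface):
        def fetch_data(self, limit=100, entities_cache=None) -> Iterable[int]:
            yield from range(limit)  # entities_cache accepted but unused


    cache: Dict = {}
    for producer in (WebAnalyticsLikeProducer(),):
        # one uniform call site, no isinstance dispatch on producer type
        print(list(producer.fetch_data(limit=3, entities_cache=cache)))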

File 7 of 7: DataInsightSource

@@ -74,6 +74,7 @@ class DataInsightSource(Source):
         super().__init__()
         self.metadata = metadata
         self.date = datetime.utcnow().strftime("%Y-%m-%d")
+        self.entities_cache = {}

         _processors = self._instantiate_processors()
         self._processors: Dict[
@@ -130,11 +131,19 @@ class DataInsightSource(Source):
             processor = cast(DataProcessor, processor)
             processor.pre_hook() if processor.pre_hook else None  # pylint: disable=expression-not-assigned

-            for data in producer.fetch_data(fields=["owner", "tags"]):
+            for data in (
+                producer.fetch_data(
+                    fields=["owner", "tags"], entities_cache=self.entities_cache
+                )
+                or []
+            ):
                 processor.refine(data)

             processor.post_hook() if processor.post_hook else None  # pylint: disable=expression-not-assigned
+            if processor.clean_up_cache:
+                self.entities_cache.clear()

             for data in processor.yield_refined_data():
                 yield Either(left=None, right=DataInsightRecord(data=data))
         except KeyError as key_error:
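
A final detail in the source loop: CostAnalysisProducer.fetch_data is now annotated Optional[Iterable[Dict]], so the loop wraps the call in `or []`; iterating None would raise TypeError, and the guard turns a producer that returns nothing into an empty loop. Minimal demonstration with a hypothetical stand-in:

    def fetch_data(entities_cache=None):
        return None  # an implementation that found nothing to produce


    for data in fetch_data() or []:  # guard: None becomes an empty iteration
        print(data)
    print("loop completed without TypeError")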