Fixes #10911 - Improve Data Insight Logging (#10915)

Teddy 2023-04-05 18:46:39 +02:00 committed by GitHub
parent 8ac6ce9f7a
commit 4683bee91a
6 changed files with 65 additions and 22 deletions


@@ -52,10 +52,13 @@ from metadata.generated.schema.metadataIngestion.workflow import (
)
from metadata.ingestion.api.parser import parse_workflow_config_gracefully
from metadata.ingestion.api.processor import ProcessorStatus
from metadata.ingestion.api.workflow import REPORTS_INTERVAL_SECONDS
from metadata.ingestion.ometa.ometa_api import EntityList, OpenMetadata
from metadata.ingestion.sink.elasticsearch import ElasticsearchSink
from metadata.timer.repeated_timer import RepeatedTimer
from metadata.timer.workflow_reporter import get_ingestion_status_timer
from metadata.utils.importer import get_sink
from metadata.utils.logger import data_insight_logger
from metadata.utils.logger import data_insight_logger, set_loggers_level
from metadata.utils.time_utils import (
get_beginning_of_day_timestamp_mill,
get_end_of_day_timestamp_mill,
@@ -78,6 +81,10 @@ class DataInsightWorkflow(WorkflowStatusMixin):
def __init__(self, config: OpenMetadataWorkflowConfig) -> None:
self.config = config
self._timer: Optional[RepeatedTimer] = None
set_loggers_level(config.workflowConfig.loggerLevel.value)
self.metadata_config: OpenMetadataConnection = (
self.config.workflowConfig.openMetadataServerConfig
)
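
The constructor now pushes the configured loggerLevel down to the ingestion loggers before anything else runs. A minimal standalone sketch of that idea (set_loggers_level here is a stand-in, not the actual helper from metadata.utils.logger):

import logging

def set_loggers_level(level: str = "INFO") -> None:
    # Stand-in helper: push the configured level to every logger in the "metadata" namespace.
    logging.getLogger("metadata").setLevel(level)
    for name in logging.root.manager.loggerDict:
        if name.startswith("metadata."):
            logging.getLogger(name).setLevel(level)

# Mirrors the constructor change above, reading the level from the workflow config.
logger_level = "DEBUG"  # e.g. config.workflowConfig.loggerLevel.value
set_loggers_level(logger_level)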
@@ -85,7 +92,7 @@ class DataInsightWorkflow(WorkflowStatusMixin):
self.set_ingestion_pipeline_status(state=PipelineState.running)
self.status = ProcessorStatus()
self.data_processor: Optional[
self.source: Optional[
Union[
DataProcessor,
EntityReportDataProcessor,
@@ -113,6 +120,16 @@ class DataInsightWorkflow(WorkflowStatusMixin):
self.es_sink = cast(ElasticsearchSink, self.es_sink)
@property
def timer(self) -> RepeatedTimer:
"""Status timer"""
if not self._timer:
self._timer = get_ingestion_status_timer(
interval=REPORTS_INTERVAL_SECONDS, logger=logger, workflow=self
)
return self._timer
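
The lazily built timer drives periodic status reports while the workflow runs. A rough, standalone approximation built on threading.Timer (the interval value and the RepeatedTimer internals are assumptions; the real ones come from metadata.timer.repeated_timer and metadata.ingestion.api.workflow):

import threading

REPORTS_INTERVAL_SECONDS = 60  # illustrative value only

class RepeatedTimer:
    """Minimal stand-in: call `function` every `interval` seconds until stop()."""

    def __init__(self, interval, function):
        self.interval = interval
        self.function = function
        self._timer = None

    def _run(self):
        self.function()
        self.trigger()  # reschedule the next report

    def trigger(self):
        self._timer = threading.Timer(self.interval, self._run)
        self._timer.daemon = True
        self._timer.start()

    def stop(self):
        if self._timer:
            self._timer.cancel()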
@staticmethod
def _is_kpi_active(entity: Kpi) -> bool:
"""Check if a KPI is active
@@ -197,10 +214,10 @@ class DataInsightWorkflow(WorkflowStatusMixin):
has_checked_and_handled_existing_es_data = False
logger.info(f"Processing data for report type {report_data_type}")
try:
self.data_processor = DataProcessor.create(
self.source = DataProcessor.create(
_data_processor_type=report_data_type.value, metadata=self.metadata
)
for record in self.data_processor.process():
for record in self.source.process():
if hasattr(self, "sink"):
self.sink.write_record(record)
if hasattr(self, "es_sink"):
@@ -214,6 +231,9 @@ class DataInsightWorkflow(WorkflowStatusMixin):
logger.warning(
"No sink attribute found, skipping ingestion of KPI result"
)
self.status.records.extend(self.source.processor_status.records)
self.status.failures.extend(self.source.processor_status.failures)
self.status.warnings.extend(self.source.processor_status.warnings)
except Exception as exc:
error = f"Error while executing data insight workflow for report type {report_data_type}: {exc}"
@@ -271,6 +291,8 @@ class DataInsightWorkflow(WorkflowStatusMixin):
def execute(self):
"""Execute workflow"""
self.timer.trigger()
try:
logger.info("Starting data processor execution")
self._execute_data_processor()
@@ -291,21 +313,23 @@ class DataInsightWorkflow(WorkflowStatusMixin):
except Exception as err:
self.set_ingestion_pipeline_status(PipelineState.failed)
raise err
finally:
self.stop()
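
execute() now starts the reporting timer up front and stops it in a finally block, so status reports cannot outlive a failed run. A minimal skeleton of that contract (names are illustrative):

import logging

logger = logging.getLogger(__name__)

class WorkflowSketch:
    # Illustrative skeleton of the execute()/stop() contract shown above.
    def __init__(self, timer):
        self.timer = timer

    def execute(self):
        self.timer.trigger()  # start periodic status reporting
        try:
            logger.info("Starting data processor execution")
            # ... run data processors, KPIs and sinks ...
        finally:
            self.stop()  # the timer is stopped even when the run fails

    def stop(self):
        self.timer.stop()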
def _raise_from_status_internal(self, raise_warnings=False):
if self.data_processor and self.data_processor.get_status().failures:
if self.source and self.source.get_status().failures:
raise WorkflowExecutionError(
"Source reported errors", self.data_processor.get_status()
"Source reported errors", self.source.get_status()
)
if hasattr(self, "sink") and self.sink.get_status().failures:
raise WorkflowExecutionError("Sink reported errors", self.sink.get_status())
if raise_warnings and (
(self.data_processor and self.data_processor.get_status().warnings)
(self.source and self.source.get_status().warnings)
or self.sink.get_status().warnings
):
raise WorkflowExecutionError(
"Source reported warnings",
self.data_processor.get_status() if self.data_processor else None,
self.source.get_status() if self.source else None,
)
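
With the processor renamed to `source`, the failure and warning checks now read from the same attribute everywhere. A condensed, standalone version of that logic (WorkflowExecutionError here is a stand-in exception, not the library's class):

class WorkflowExecutionError(Exception):
    """Stand-in exception for illustration."""

def raise_from_status(source_status, sink_status, raise_warnings=False):
    # Condensed version of the checks above, reading from `source` rather than `data_processor`.
    if source_status and source_status.failures:
        raise WorkflowExecutionError(f"Source reported errors: {source_status.failures}")
    if sink_status and sink_status.failures:
        raise WorkflowExecutionError(f"Sink reported errors: {sink_status.failures}")
    if raise_warnings and ((source_status and source_status.warnings) or (sink_status and sink_status.warnings)):
        raise WorkflowExecutionError("Source reported warnings")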
def print_status(self) -> None:
@@ -316,7 +340,7 @@ class DataInsightWorkflow(WorkflowStatusMixin):
Returns 1 if status is failed, 0 otherwise.
"""
if (
(self.data_processor and self.data_processor.get_status().failures)
(self.source and self.source.get_status().failures)
or self.status.failures
or (hasattr(self, "sink") and self.sink.get_status().failures)
):
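
The hunk is cut at the condition; per the docstring, the method returns 1 when any component reported failures and 0 otherwise. A standalone sketch under that assumption:

def result_status(source_status, workflow_status, sink_status=None) -> int:
    # 1 when any component reported failures, 0 otherwise.
    if (
        (source_status and source_status.failures)
        or workflow_status.failures
        or (sink_status is not None and sink_status.failures)
    ):
        return 1
    return 0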
@@ -328,3 +352,4 @@ class DataInsightWorkflow(WorkflowStatusMixin):
Close all connections
"""
self.metadata.close()
self.timer.stop()


@@ -171,11 +171,19 @@ class EntityReportDataProcessor(DataProcessor):
refined_data = defaultdict(lambda: defaultdict(dict))
for entity in self.fetch_data():
data_blob_for_entity = {}
team = (
self._get_team(entity.owner)
if not isinstance(entity, User)
else self._get_team(entity.teams)
)
try:
team = (
self._get_team(entity.owner)
if not isinstance(entity, User)
else self._get_team(entity.teams)
)
except Exception:
logger.debug(traceback.format_exc())
self.processor_status.failed(
entity.name.__root__, "Error retrieving team"
)
continue
try:
entity_tier = get_entity_tier_from_tags(entity.tags)
except AttributeError:
@@ -230,6 +238,8 @@ class EntityReportDataProcessor(DataProcessor):
str(entity_tier)
].update(data_blob_for_entity_counter)
self.processor_status.scanned(entity.name.__root__)
return refined_data
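
A failure while resolving an entity's owning team no longer aborts the whole refine pass: the stack trace is kept at debug level, the entity is recorded as failed, and iteration continues. A standalone sketch of that pattern (ProcessorStatusSketch and the dict-shaped entities are stand-ins, not the real models):

import logging
import traceback

logger = logging.getLogger(__name__)

class ProcessorStatusSketch:
    # Minimal stand-in for ProcessorStatus used below.
    def __init__(self):
        self.records, self.failures = [], []
    def scanned(self, name):
        self.records.append(name)
    def failed(self, name, reason):
        self.failures.append((name, reason))

def refine_entities(entities, get_team, status):
    refined = []
    for entity in entities:  # entities are plain dicts in this sketch
        try:
            team = get_team(entity)
        except Exception:
            # Keep the stack trace at debug level, mark the entity as failed, and keep going.
            logger.debug(traceback.format_exc())
            status.failed(entity["name"], "Error retrieving team")
            continue
        refined.append((entity["name"], team))
        status.scanned(entity["name"])
    return refined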
def process(self) -> Iterable[ReportData]:


@@ -195,6 +195,8 @@ class WebAnalyticEntityViewReportDataProcessor(DataProcessor):
else:
refined_data[split_url[1]]["views"] += 1
self.processor_status.scanned(ENTITIES[entity_type].__name__)
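
The web analytics processors now also report each scanned entity to their processor status while aggregating page views. A simplified sketch of the counting-plus-scanned pattern (the URL parsing and status object are illustrative, not the processor's real logic):

from collections import defaultdict

class StatusSketch:
    def __init__(self):
        self.records = []
    def scanned(self, name):
        self.records.append(name)

refined_data = defaultdict(lambda: {"views": 0})
status = StatusSketch()

def record_page_view(url_path: str) -> None:
    # e.g. "/table/my_table.orders" -> bump the "table" view counter, then mark it as scanned.
    entity_type = url_path.split("/")[1]
    refined_data[entity_type]["views"] += 1
    status.scanned(entity_type)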
def refine(self):
"""Aggregates data. It will return a dictionary of the following shape
@@ -325,6 +327,8 @@ class WebAnalyticUserActivityReportDataProcessor(DataProcessor):
if timestamp > user_data["lastSession"]:
user_data["lastSession"] = timestamp
self.processor_status.scanned(user_id)
def fetch_data(self) -> Iterable[WebAnalyticEventData]:
if CACHED_EVENTS:
for event in CACHED_EVENTS:


@@ -66,8 +66,8 @@ class MetadataRestSink(Sink[Entity]):
try:
if isinstance(record, ReportData):
self.metadata.add_data_insight_report_data(record)
logger.info(
"Successfully ingested data insight for"
logger.debug(
"Successfully ingested data insight for "
f"{record.data.__class__.__name__ if record.data else 'Unknown'}"
)
self.status.records_written(
@@ -75,7 +75,7 @@ class MetadataRestSink(Sink[Entity]):
)
if isinstance(record, KpiResult):
self.metadata.add_kpi_result(fqn=record.kpiFqn.__root__, record=record)
logger.info(f"Successfully ingested KPI for {record.kpiFqn}")
logger.debug(f"Successfully ingested KPI for {record.kpiFqn}")
self.status.records_written(f"Data Insight: {record.kpiFqn}")
except APIError as err:
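
write_record now logs each successful ingestion at DEBUG instead of INFO (and adds the missing trailing space in the first message), keeping routine per-record writes out of the default log output. A condensed sketch, with `client` standing in for the OpenMetadata API object:

import logging

logger = logging.getLogger(__name__)

def write_record(record, client, status) -> None:
    # `client` is a hypothetical stand-in exposing add_data_insight_report_data(), as in the hunk above.
    try:
        client.add_data_insight_report_data(record)
        # Per-record success is now DEBUG, so INFO output is no longer flooded.
        logger.debug("Successfully ingested data insight for %s", type(record).__name__)
        status.records_written(f"Data Insight: {type(record).__name__}")
    except Exception as exc:
        logger.warning("Failed to ingest data insight record: %s", exc)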


@@ -342,7 +342,11 @@ class ElasticsearchSink(Sink[Entity]):
try:
self._write_record(record)
self.status.records_written(record.name.__root__)
self.status.records_written(
record.name.__root__
if hasattr(record, "name")
else type(record).__name__
)
except Exception as exc:
logger.debug(traceback.format_exc())
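
Not every record passed to the Elasticsearch sink carries a `name` attribute (KPI results and report data do not), so the status entry now falls back to the record's class name. A tiny helper expressing the same rule (illustrative, not part of the sink):

def record_label(record) -> str:
    # Prefer the entity's name (a pydantic __root__ field in OpenMetadata models);
    # otherwise fall back to the class name, mirroring the hunk above.
    name = getattr(record, "name", None)
    return name.__root__ if name is not None else type(record).__name__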


@@ -278,18 +278,18 @@ def print_data_insight_status(workflow) -> None:
print_workflow_summary(
workflow,
processor=True,
processor_status=workflow.data_processor.get_status(),
processor_status=workflow.status,
)
if workflow.data_processor.get_status().source_start_time:
if workflow.source.get_status().source_start_time:
log_ansi_encoded_string(
message=f"Workflow finished in time {pretty_print_time_duration(time.time()-workflow.data_processor.get_status().source_start_time)} ", # pylint: disable=line-too-long
message=f"Workflow finished in time {pretty_print_time_duration(time.time()-workflow.source.get_status().source_start_time)} ", # pylint: disable=line-too-long
)
if workflow.result_status() == 1:
log_ansi_encoded_string(message=WORKFLOW_FAILURE_MESSAGE)
elif (
workflow.data_processor.get_status().warnings
workflow.source.get_status().warnings
or workflow.status.warnings
or (hasattr(workflow, "sink") and workflow.sink.get_status().warnings)
):
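
print_data_insight_status now reads the aggregated workflow.status for the processor summary and uses workflow.source only for timing and warnings. A small sketch of the elapsed-time message (pretty_print_time_duration here is a simplified stand-in, not the library helper):

import time

def pretty_print_time_duration(seconds: float) -> str:
    # Simplified stand-in for the formatting helper used above.
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes}m{secs}s" if minutes else f"{secs}s"

def finish_message(source_start_time: float) -> str:
    # Mirrors the message built above from workflow.source.get_status().source_start_time.
    return f"Workflow finished in time {pretty_print_time_duration(time.time() - source_start_time)}"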