Pere Miquel Brull c742835766
Auto Tagger Application - Preparing the Ingestion Framework (#13862)
* Prepare the skeleton for generic app registration

* Prepare the skeleton for generic app registration

* Handle app runner

* Prepare the skeleton for generic app registration

* Prepare the skeleton for generic app registration

* Allow deployment

* Fix PII APP

* Fix lint

* Fix PII APP

* Fix PII APP

* Prepare config-based external apps

* Prepare config-based external apps

* Fix lint

* Prepare config-based external apps

* Fix DI errors

* Amend comments
2023-11-13 08:58:38 +01:00

# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Workflow definition for the data insight
"""
from metadata.data_insight.processor.kpi.kpi_runner import KpiRunner
from metadata.data_insight.source.metadata import DataInsightSource
from metadata.generated.schema.analytics.basic import WebAnalyticEventType
from metadata.generated.schema.tests.testSuite import ServiceType
from metadata.ingestion.api.steps import Sink
from metadata.utils.importer import import_sink_class
from metadata.utils.logger import data_insight_logger
from metadata.utils.time_utils import get_beginning_of_day_timestamp_mill
from metadata.workflow.ingestion import IngestionWorkflow

logger = data_insight_logger()


class DataInsightWorkflow(IngestionWorkflow):
"""Data insight ingestion workflow implementation"""
retention_days = 7
def _retrieve_service_connection_if_needed(self, service_type: ServiceType) -> None:
"""No service connection needed for data insight"""
return None

    def _run_kpi_processor(self) -> None:
        """Run the KPI processor. It runs as a step, but only after the main
        execution: the source needs to run first to compute the KPI results."""
        kpi_runner = KpiRunner(self.metadata)

        for kpi_result in kpi_runner.run():
            # The sink is the only configured step: send each KPI result to it
            self.steps[0].run(kpi_result)

    def _clean_up_web_analytics_events(self) -> None:
        """
        We delete web analytics events older than `cls.retention_days`
        to prevent storage explosion.

        TODO: deprecate once this is implemented with backend workflows
        """
        tmsp = get_beginning_of_day_timestamp_mill(days=self.retention_days)
        for web_analytic_event in WebAnalyticEventType:
            self.metadata.delete_web_analytic_event_before_ts_exclusive(
                web_analytic_event,
                tmsp,
            )

    def _get_sink(self) -> Sink:
        """Retrieve sink for data insight workflow"""
        # Data insight results are always pushed through the metadata REST sink
        sink_type = "metadata-rest"
        sink_class = import_sink_class(sink_type=sink_type)
        sink_config = {"api_endpoint": self.metadata_config.hostPort}
        sink: Sink = sink_class.create(sink_config, self.metadata)
        logger.debug(f"Sink type:{self.config.sink.type}, {sink_class} configured")
        return sink

    def execute_internal(self):
        """Use the parent logic, then process KPIs and clean up old web analytics events"""
        super().execute_internal()
        self._run_kpi_processor()
        self._clean_up_web_analytics_events()

    def set_steps(self):
        self.source = DataInsightSource.create(self.metadata)  # type: ignore

        sink = self._get_sink()
        self.steps = (sink,)
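
For reference, the sketch below shows how a workflow like this is typically driven from Python. It assumes the standard create()/execute()/stop() interface that DataInsightWorkflow inherits from IngestionWorkflow; the configuration keys shown are illustrative assumptions rather than the exact dataInsight pipeline schema.

# A minimal sketch of driving this workflow, assuming the standard
# create() / execute() / stop() interface inherited from IngestionWorkflow.
# The YAML below is illustrative only: the exact keys of a dataInsight
# pipeline configuration are an assumption, not taken from this file.
import yaml

from metadata.workflow.data_insight import DataInsightWorkflow

CONFIG = """
source:
  type: dataInsight
  serviceName: OpenMetadata
  sourceConfig:
    config: {}
sink:
  type: metadata-rest
  config: {}
workflowConfig:
  openMetadataServerConfig:
    hostPort: http://localhost:8585/api
    authProvider: openmetadata
    securityConfig:
      jwtToken: "<token>"
"""


def run() -> None:
    workflow = DataInsightWorkflow.create(yaml.safe_load(CONFIG))
    workflow.execute()  # runs the source and sink, then KPIs and event cleanup
    workflow.raise_from_status()
    workflow.print_status()
    workflow.stop()


if __name__ == "__main__":
    run()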