mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-08-14 12:06:54 +00:00
GEN-1410 - Improve PII Logging information (#17835)
This commit is contained in:
parent
43d8c13709
commit
4e7b1381b1
@ -14,6 +14,7 @@ NER Scanner based on Presidio.
|
||||
Supported Entities https://microsoft.github.io/presidio/supported_entities/
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import traceback
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
@ -26,9 +27,11 @@ from metadata.pii.models import TagAndConfidence
|
||||
from metadata.pii.ner import NEREntity
|
||||
from metadata.pii.scanners.base import BaseScanner
|
||||
from metadata.utils import fqn
|
||||
from metadata.utils.logger import pii_logger
|
||||
from metadata.utils.logger import METADATA_LOGGER, pii_logger
|
||||
|
||||
logger = pii_logger()
|
||||
SUPPORTED_LANG = "en"
|
||||
PRESIDIO_LOGGER = "presidio-analyzer"
|
||||
|
||||
|
||||
class StringAnalysis(BaseModel):
|
||||
@ -40,11 +43,16 @@ class StringAnalysis(BaseModel):
|
||||
appearances: int
|
||||
|
||||
|
||||
class NLPEngineModel(BaseModel):
|
||||
"""Required to pass the nlp_engine as {"lang_code": "en", "model_name": "en_core_web_lg"}"""
|
||||
|
||||
lang_code: str
|
||||
model_name: str
|
||||
|
||||
|
||||
# pylint: disable=import-outside-toplevel
|
||||
class NERScanner(BaseScanner):
|
||||
"""
|
||||
Based on https://microsoft.github.io/presidio/
|
||||
"""
|
||||
"""Based on https://microsoft.github.io/presidio/"""
|
||||
|
||||
def __init__(self):
|
||||
import spacy
|
||||
@ -60,8 +68,19 @@ class NERScanner(BaseScanner):
|
||||
download(SPACY_EN_MODEL)
|
||||
spacy.load(SPACY_EN_MODEL)
|
||||
|
||||
nlp_engine_model = NLPEngineModel(
|
||||
lang_code=SUPPORTED_LANG, model_name=SPACY_EN_MODEL
|
||||
)
|
||||
|
||||
# Set the presidio logger to talk less about internal entities unless we are debugging
|
||||
logging.getLogger(PRESIDIO_LOGGER).setLevel(
|
||||
logging.INFO
|
||||
if logging.getLogger(METADATA_LOGGER).level == logging.DEBUG
|
||||
else logging.ERROR
|
||||
)
|
||||
|
||||
self.analyzer = AnalyzerEngine(
|
||||
nlp_engine=SpacyNlpEngine(models={"en": SPACY_EN_MODEL})
|
||||
nlp_engine=SpacyNlpEngine(models=[nlp_engine_model.model_dump()])
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
@ -289,6 +289,7 @@ class BaseWorkflow(ABC, WorkflowStatusMixin):
|
||||
for step in self.workflow_steps():
|
||||
logger.info(
|
||||
f"{step.name}: Processed {len(step.status.records)} records,"
|
||||
f" updated {len(step.status.updated_records)} records,"
|
||||
f" filtered {len(step.status.filtered)} records,"
|
||||
f" found {len(step.status.failures)} errors"
|
||||
)
|
||||
|
@ -11,6 +11,11 @@
|
||||
"""
|
||||
Workflow definition for the profiler
|
||||
"""
|
||||
from typing import cast
|
||||
|
||||
from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
|
||||
DatabaseServiceProfilerPipeline,
|
||||
)
|
||||
from metadata.generated.schema.metadataIngestion.workflow import (
|
||||
OpenMetadataWorkflowConfig,
|
||||
)
|
||||
@ -58,9 +63,17 @@ class ProfilerWorkflow(IngestionWorkflow):
|
||||
self.source = source_class.create(self.config.model_dump(), self.metadata)
|
||||
|
||||
profiler_processor = self._get_profiler_processor()
|
||||
pii_processor = self._get_pii_processor()
|
||||
sink = self._get_sink()
|
||||
self.steps = (profiler_processor, pii_processor, sink)
|
||||
|
||||
# Only instantiate the PII Processor on demand
|
||||
source_config: DatabaseServiceProfilerPipeline = cast(
|
||||
DatabaseServiceProfilerPipeline, self.config.source.sourceConfig.config
|
||||
)
|
||||
if source_config.processPiiSensitive:
|
||||
pii_processor = self._get_pii_processor()
|
||||
self.steps = (profiler_processor, pii_processor, sink)
|
||||
else:
|
||||
self.steps = (profiler_processor, sink)
|
||||
|
||||
def test_connection(self):
|
||||
service_config = self.config.source.serviceConnection.root.config
|
||||
|
Loading…
x
Reference in New Issue
Block a user