diff --git a/ingestion/src/metadata/pii/algorithms/classifiers.py b/ingestion/src/metadata/pii/algorithms/classifiers.py index dd97193d888..2e9112ddada 100644 --- a/ingestion/src/metadata/pii/algorithms/classifiers.py +++ b/ingestion/src/metadata/pii/algorithms/classifiers.py @@ -73,9 +73,6 @@ class ColumnClassifier(ABC, Generic[T]): """ -# Implementations - - @final class HeuristicPIIClassifier(ColumnClassifier[PIITag]): """ diff --git a/ingestion/src/metadata/pii/algorithms/presidio_utils.py b/ingestion/src/metadata/pii/algorithms/presidio_utils.py index cfb7f3eeb43..daf587f8522 100644 --- a/ingestion/src/metadata/pii/algorithms/presidio_utils.py +++ b/ingestion/src/metadata/pii/algorithms/presidio_utils.py @@ -13,7 +13,7 @@ Utilities for working with the Presidio Library. """ import inspect import logging -from typing import Iterable, Optional, Type +from typing import Iterable, Type, Union import spacy from presidio_analyzer import ( @@ -26,7 +26,7 @@ from presidio_analyzer.nlp_engine import SpacyNlpEngine from spacy.cli.download import download # pyright: ignore[reportUnknownVariableType] from metadata.pii.constants import PRESIDIO_LOGGER, SPACY_EN_MODEL, SUPPORTED_LANG -from metadata.utils.logger import METADATA_LOGGER, pii_logger +from metadata.utils.logger import pii_logger logger = pii_logger() @@ -64,17 +64,10 @@ def build_analyzer_engine( return analyzer_engine -def set_presidio_logger_level(log_level: Optional[int] = None) -> None: +def set_presidio_logger_level(log_level: Union[int, str] = logging.ERROR) -> None: """ Set the presidio logger to talk less about internal entities unless we are debugging. """ - if log_level is None: - log_level = ( - logging.INFO - if logging.getLogger(METADATA_LOGGER).level == logging.DEBUG - else logging.ERROR - ) - logging.getLogger(PRESIDIO_LOGGER).setLevel(log_level) @@ -87,7 +80,6 @@ def _load_spacy_model(model_name: str) -> None: try: _ = spacy.load(model_name) except OSError: - logger.warning(f"Downloading {model_name} language model for the spaCy") download(model_name) _ = spacy.load(model_name) diff --git a/ingestion/src/metadata/pii/scanners/ner_scanner.py b/ingestion/src/metadata/pii/scanners/ner_scanner.py index 58e0ef84b32..da189cbae50 100644 --- a/ingestion/src/metadata/pii/scanners/ner_scanner.py +++ b/ingestion/src/metadata/pii/scanners/ner_scanner.py @@ -22,6 +22,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union from pydantic import BaseModel, ConfigDict from metadata.generated.schema.entity.classification.tag import Tag +from metadata.pii.algorithms.presidio_utils import _load_spacy_model from metadata.pii.constants import PII, SPACY_EN_MODEL from metadata.pii.models import TagAndConfidence from metadata.pii.ner import NEREntity @@ -56,18 +57,10 @@ class NERScanner(BaseScanner): """Based on https://microsoft.github.io/presidio/""" def __init__(self): - import spacy from presidio_analyzer import AnalyzerEngine from presidio_analyzer.nlp_engine.spacy_nlp_engine import SpacyNlpEngine - try: - spacy.load(SPACY_EN_MODEL) - except OSError: - logger.warning("Downloading en_core_web_md language model for the spaCy") - from spacy.cli import download - - download(SPACY_EN_MODEL) - spacy.load(SPACY_EN_MODEL) + _load_spacy_model(SPACY_EN_MODEL) nlp_engine_model = NLPEngineModel( lang_code=SUPPORTED_LANG, model_name=SPACY_EN_MODEL