Restructure NER Scanner internals (#11690)

* Simplify col name scanner
* Restructure NER Scanner internals
2025-12-06 12:34:16 +00:00 · 2023-05-19 18:21:01 +02:00 · 2023-05-19 18:21:01 +02:00 · 0eb2201f94
commit 0eb2201f94
parent fafbfdaeab
4 changed files with 140 additions and 100 deletions
--- a/ingestion/src/metadata/pii/column_name_scanner.py
+++ b/ingestion/src/metadata/pii/column_name_scanner.py
@@ -12,72 +12,42 @@
 Regex scanner for column names
 """
 import re
 from enum import Enum, auto
 from typing import Optional
 from metadata.pii.models import TagAndConfidence, TagType
 class PiiTypes(Enum):
    """
    PiiTypes enumerates the different types of PII data.

    Every member is declared with `auto()`, so its value is an
    auto-generated integer that follows the declaration order below.
    Reordering or inserting members therefore changes the values of
    the members that come after.
    """
    # NOTE(review): presumably sentinels for "no PII" / "type not handled" -
    # confirm against callers
    NONE = auto()
    UNSUPPORTED = auto()
    # PII categories
    PHONE = auto()
    EMAIL = auto()
    CREDIT_CARD = auto()
    ADDRESS = auto()
    ADDRESS_LOCATION = auto()
    PERSON = auto()
    LOCATION = auto()
    BIRTH_DATE = auto()
    GENDER = auto()
    NATIONALITY = auto()
    IP_ADDRESS = auto()
    SSN = auto()
    USER_NAME = auto()
    PASSWORD = auto()
    ETHNICITY = auto()
    TAX_ID = auto()
    KEY = auto()
    BANKACC = auto()
 class ColumnNameScanner:
    """
    Column Name Scanner to scan column name
    """
    sensitive_regex = {
-        PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
+        "PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
-        PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
+        "SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
-        PiiTypes.CREDIT_CARD: re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
+        "CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
-        PiiTypes.BANKACC: re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
+        "BANKACC": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
-        PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
+        "EMAIL": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
-        PiiTypes.USER_NAME: re.compile(
+        "USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
-            "^.*(user|client|person).*(name).*$", re.IGNORECASE
+        "PERSON": re.compile(
        ),
        PiiTypes.PERSON: re.compile(
            "^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
            re.IGNORECASE,
        ),
    }
    non_sensitive_regex = {
-        PiiTypes.BIRTH_DATE: re.compile(
+        "BIRTH_DATE": re.compile(
            "^.*(date_of_birth|dateofbirth|dob|"
            "birthday|date_of_death|dateofdeath).*$",
            re.IGNORECASE,
        ),
-        PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
+        "GENDER": re.compile("^.*(gender).*$", re.IGNORECASE),
-        PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
+        "NATIONALITY": re.compile("^.*(nationality).*$", re.IGNORECASE),
-        PiiTypes.ADDRESS: re.compile(
+        "ADDRESS": re.compile(
            "^.*(address|city|state|county|country|"
            "zipcode|zip|postal|zone|borough).*$",
            re.IGNORECASE,
        ),
-        PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
+        "PHONE": re.compile("^.*(phone).*$", re.IGNORECASE),
    }
    @classmethod
--- a/ingestion/src/metadata/pii/ner_scanner.py
+++ b/ingestion/src/metadata/pii/ner_scanner.py
@@ -14,8 +14,11 @@ NER Scanner based on Presidio.
 Supported Entities https://microsoft.github.io/presidio/supported_entities/
 """
 import traceback
 from collections import defaultdict
 from enum import Enum
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 from pydantic import BaseModel
 from metadata.pii import SPACY_EN_MODEL
 from metadata.pii.models import TagAndConfidence, TagType
@@ -25,21 +28,58 @@ logger = pii_logger()
 class NEREntity(Enum):
    """
    PII Entities supported by Presidio https://microsoft.github.io/presidio/supported_entities/
    """
    # Global
    CREDIT_CARD = TagType.SENSITIVE.value
    CRYPTO = TagType.SENSITIVE.value
    DATE_TIME = TagType.NONSENSITIVE.value
    EMAIL_ADDRESS = TagType.SENSITIVE.value
    IBAN_CODE = TagType.SENSITIVE.value
    IP_ADDRESS = TagType.SENSITIVE.value
    NRP = TagType.NONSENSITIVE.value
    LOCATION = TagType.NONSENSITIVE.value
    PERSON = TagType.SENSITIVE.value
    PHONE_NUMBER = TagType.NONSENSITIVE.value
    MEDICAL_LICENSE = TagType.SENSITIVE.value
    US_DRIVER_LICENSE = TagType.SENSITIVE.value
    DATE_TIME = TagType.NONSENSITIVE.value
    URL = TagType.NONSENSITIVE.value
    # USA
    US_BANK_NUMBER = TagType.SENSITIVE.value
-    US_SSN = TagType.SENSITIVE.value
+    US_DRIVER_LICENSE = TagType.SENSITIVE.value
-    PERSON = TagType.SENSITIVE.value
+    US_ITIN = TagType.SENSITIVE.value
    US_PASSPORT = TagType.SENSITIVE.value
    US_SSN = TagType.SENSITIVE.value
    # UK
    UK_NHS = TagType.SENSITIVE.value
    # Spain
    NIF = TagType.SENSITIVE.value
    # Italy
    IT_FISCAL_CODE = TagType.SENSITIVE.value
    IT_DRIVER_LICENSE = TagType.SENSITIVE.value
    IT_VAT_CODE = TagType.SENSITIVE.value
    IT_PASSPORT = TagType.SENSITIVE.value
    IT_IDENTITY_CARD = TagType.SENSITIVE.value
    # Australia
    AU_ABN = TagType.SENSITIVE.value
    AU_ACN = TagType.SENSITIVE.value
    AU_TFN = TagType.SENSITIVE.value
    AU_MEDICARE = TagType.SENSITIVE.value
 class StringAnalysis(BaseModel):
    """
    Used to store results from the sample data scans for each NER Entity.

    One instance is kept per entity type while a column's sample rows
    are being scanned.
    """
    # Highest confidence score seen so far for the entity across the scanned rows
    score: float
    # Number of analyzer results in which the entity has appeared
    appearances: int
 # pylint: disable=import-outside-toplevel
@@ -68,46 +108,61 @@ class NERScanner:
    @staticmethod
    def get_highest_score_label(
-        labels_score, str_sample_data_rows: List[str]
+        entities_score: Dict[str, StringAnalysis]
-    ) -> Tuple[Optional[str], Optional[float]]:
+    ) -> Tuple[str, float]:
-        most_used_label_occurrence = 0
+        top_entity = max(
-        label_score = None
+            entities_score,
-        for label, score in labels_score.items():
+            key=lambda type_: entities_score[type_].score
-            if score[0] == 1.0 and score[1] > len(str_sample_data_rows) * 0.8:
+            * entities_score[type_].appearances
-                return (label, score[0])
+            * 0.8,
-            if score[1] > most_used_label_occurrence:
+        )
-                label_score = (label, score[0])
+        return top_entity, entities_score[top_entity].score
                most_used_label_occurrence = score[1]
        return label_score or (None, None)
    def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
        """
-        Scan the column's sample data rows and look for PII
+        Scan the column's sample data rows and look for PII.
        How this works:
        1. We create a list of strings [s1, s2, ..., sn] with each sample data row for a column
        2. Then, for each s_i:
          a. Run the analyzer, which will return a list of possible recognized Entities and confidence score
             For example, the result of analyzing `123456789` gives us
               [
                 type: DATE_TIME, start: 0, end: 9, score: 0.85,
                 type: US_BANK_NUMBER, start: 0, end: 9, score: 0.05,
                 type: US_PASSPORT, start: 0, end: 9, score: 0.05,
                 type: US_DRIVER_LICENSE, start: 0, end: 9, score: 0.01
              ]
          b. Each time an `Entity` appears (e.g., DATE_TIME), we store its max score and the number of appearances
        3. After gathering all the results for each row, get the `Entity` with maximum overall score
           and number of appearances. This gets computed as "score * appearances * 0.8", which can
           be thought of as the "score" times the "weighted down appearances".
        4. Once we have the "top" `Entity` from that column, we assign the PII label accordingly from `NEREntity`.
        """
        logger.debug("Processing '%s'", sample_data_rows)
-        labels_score = {}
+
        # Initialize an empty dict for the given row list
        entities_score: Dict[str, StringAnalysis] = defaultdict(
            lambda: StringAnalysis(score=0, appearances=0)
        )
        str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
        for row in str_sample_data_rows:
            try:
                results = self.analyzer.analyze(row, language="en")
                for result in results:
-                    logger.debug("Found %s", result.entity_type)
+                    entities_score[result.entity_type] = StringAnalysis(
-                    tag = result.entity_type
+                        score=result.score
-                    if tag in labels_score:
+                        if result.score > entities_score[result.entity_type].score
-                        labels_score[tag] = (
+                        else entities_score[result.entity_type].score,
-                            result.score
+                        appearances=entities_score[result.entity_type].appearances + 1,
-                            if result.score > labels_score[tag][0]
+                    )
                            else labels_score[tag][0],
                            labels_score[tag][1] + 1,
                        )
                    else:
                        labels_score[tag] = (result.score, 1)
            except Exception as exc:
                logger.warning(f"Unknown error while processing {row} - {exc}")
                logger.debug(traceback.format_exc())
-        label, score = self.get_highest_score_label(labels_score, str_sample_data_rows)
+        if entities_score:
-        if label and score:
+            label, score = self.get_highest_score_label(entities_score)
            tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
            return TagAndConfidence(tag=tag_type, confidence=score)
--- a/ingestion/src/metadata/pii/processor.py
+++ b/ingestion/src/metadata/pii/processor.py
@@ -69,31 +69,34 @@ class PIIProcessor:
        """
        for idx, column in enumerate(table_entity.columns):
-            # First, check if the column we are about to process
+            try:
-            # already has PII tags or not
+                # First, check if the column we are about to process
-            column_has_pii_tag = any(
+                # already has PII tags or not
-                (PII in tag.tagFQN.__root__ for tag in column.tags or [])
+                column_has_pii_tag = any(
-            )
+                    (PII in tag.tagFQN.__root__ for tag in column.tags or [])
            # If it has PII tags, we skip the processing
            # for the column
            if column_has_pii_tag is True:
                continue
            # Scan by column name. If no results there, check the sample data, if any
            tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
                self.ner_scanner.scan([row[idx] for row in table_data.rows])
                if table_data
                else None
            )
            if (
                tag_and_confidence
                and tag_and_confidence.tag
                and tag_and_confidence.confidence >= confidence_threshold / 100
            ):
                self.patch_column_tag(
                    tag_type=tag_and_confidence.tag.value,
                    table_entity=table_entity,
                    column_name=table_entity.columns[idx].name.__root__,
                )
                # If it has PII tags, we skip the processing
                # for the column
                if column_has_pii_tag is True:
                    continue
                # Scan by column name. If no results there, check the sample data, if any
                tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
                    self.ner_scanner.scan([row[idx] for row in table_data.rows])
                    if table_data
                    else None
                )
                if (
                    tag_and_confidence
                    and tag_and_confidence.tag
                    and tag_and_confidence.confidence >= confidence_threshold / 100
                ):
                    self.patch_column_tag(
                        tag_type=tag_and_confidence.tag.value,
                        table_entity=table_entity,
                        column_name=table_entity.columns[idx].name.__root__,
                    )
            except Exception as err:
                logger.warning(f"Error computing PII tags for [{column}] - [{err}]")
--- a/ingestion/tests/unit/pii/test_ner_scanner.py
+++ b/ingestion/tests/unit/pii/test_ner_scanner.py
@@ -51,3 +51,15 @@ class NERScannerTest(TestCase):
            ).tag,
            TagType.SENSITIVE,
        )
    def test_scanner_nonsensitive(self):
        """
        Sample rows made only of place names should be tagged as non-sensitive
        """
        location_samples = [
            "Washington",
            "Alaska",
            "Netherfield Lea Street",
        ]
        detected_tag = self.ner_scanner.scan(location_samples).tag
        self.assertEqual(detected_tag, TagType.NONSENSITIVE)