mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-12-03 19:16:10 +00:00
Restructure NER Scanner internals (#11690)
* Simplify the column-name scanner
* Restructure the NER Scanner internals
This commit is contained in:
parent
fafbfdaeab
commit
0eb2201f94
@ -12,72 +12,42 @@
|
||||
Regex scanner for column names
|
||||
"""
|
||||
import re
|
||||
from enum import Enum, auto
|
||||
from typing import Optional
|
||||
|
||||
from metadata.pii.models import TagAndConfidence, TagType
|
||||
|
||||
|
||||
class PiiTypes(Enum):
    """
    PiiTypes enumerates the different types of PII data
    a scanner can recognise. Member order is significant:
    ``auto()`` assigns values 1..N in declaration order.
    """

    # Sentinel / fallback kinds
    NONE = auto()
    UNSUPPORTED = auto()
    # Contact information
    PHONE = auto()
    EMAIL = auto()
    # Financial
    CREDIT_CARD = auto()
    # Location-related
    ADDRESS = auto()
    ADDRESS_LOCATION = auto()
    # Identity
    PERSON = auto()
    LOCATION = auto()
    BIRTH_DATE = auto()
    GENDER = auto()
    NATIONALITY = auto()
    # Technical identifiers
    IP_ADDRESS = auto()
    SSN = auto()
    USER_NAME = auto()
    PASSWORD = auto()
    # Demographic / fiscal
    ETHNICITY = auto()
    TAX_ID = auto()
    KEY = auto()
    BANKACC = auto()
||||
|
||||
|
||||
class ColumnNameScanner:
    """
    Column Name Scanner to scan column name
    """

    # Column-name patterns that flag a column as sensitive PII.
    # Keys are the PII label names; values are case-insensitive
    # full-line regexes matched against the column name.
    sensitive_regex = {
        label: re.compile(pattern, re.IGNORECASE)
        for label, pattern in (
            ("PASSWORD", "^.*password.*$"),
            ("SSN", "^.*(ssn|social).*$"),
            ("CREDIT_CARD", "^.*(credit).*(card).*$"),
            ("BANKACC", "^.*bank.*(acc|num).*$"),
            ("EMAIL", "^.*(email|e-mail|mail).*$"),
            ("USER_NAME", "^.*(user|client|person).*(name).*$"),
            (
                "PERSON",
                "^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
            ),
        )
    }
    # Column-name patterns that flag a column as non-sensitive PII.
    non_sensitive_regex = {
        label: re.compile(pattern, re.IGNORECASE)
        for label, pattern in (
            (
                "BIRTH_DATE",
                "^.*(date_of_birth|dateofbirth|dob|"
                "birthday|date_of_death|dateofdeath).*$",
            ),
            ("GENDER", "^.*(gender).*$"),
            ("NATIONALITY", "^.*(nationality).*$"),
            (
                "ADDRESS",
                "^.*(address|city|state|county|country|"
                "zipcode|zip|postal|zone|borough).*$",
            ),
            ("PHONE", "^.*(phone).*$"),
        )
    }
|
||||
|
||||
@classmethod
|
||||
|
||||
@ -14,8 +14,11 @@ NER Scanner based on Presidio.
|
||||
Supported Entities https://microsoft.github.io/presidio/supported_entities/
|
||||
"""
|
||||
import traceback
|
||||
from collections import defaultdict
|
||||
from enum import Enum
|
||||
from typing import Any, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from metadata.pii import SPACY_EN_MODEL
|
||||
from metadata.pii.models import TagAndConfidence, TagType
|
||||
@ -25,21 +28,58 @@ logger = pii_logger()
|
||||
|
||||
|
||||
class NEREntity(Enum):
    """
    PII Entities supported by Presidio https://microsoft.github.io/presidio/supported_entities/

    Each member maps a Presidio entity type to the OpenMetadata tag value
    (sensitive / non-sensitive). Many members share the same value, so the
    Enum machinery turns later ones into aliases of the first member with
    that value; lookups therefore go through ``NEREntity.__members__``,
    which includes aliases, never through value lookup.
    """

    # Global
    CREDIT_CARD = TagType.SENSITIVE.value
    CRYPTO = TagType.SENSITIVE.value
    DATE_TIME = TagType.NONSENSITIVE.value
    EMAIL_ADDRESS = TagType.SENSITIVE.value
    IBAN_CODE = TagType.SENSITIVE.value
    IP_ADDRESS = TagType.SENSITIVE.value
    NRP = TagType.NONSENSITIVE.value
    LOCATION = TagType.NONSENSITIVE.value
    PERSON = TagType.SENSITIVE.value
    PHONE_NUMBER = TagType.NONSENSITIVE.value
    MEDICAL_LICENSE = TagType.SENSITIVE.value
    URL = TagType.NONSENSITIVE.value

    # USA
    US_BANK_NUMBER = TagType.SENSITIVE.value
    US_DRIVER_LICENSE = TagType.SENSITIVE.value
    US_ITIN = TagType.SENSITIVE.value
    US_PASSPORT = TagType.SENSITIVE.value
    US_SSN = TagType.SENSITIVE.value

    # UK
    UK_NHS = TagType.SENSITIVE.value

    # Spain
    NIF = TagType.SENSITIVE.value

    # Italy
    IT_FISCAL_CODE = TagType.SENSITIVE.value
    IT_DRIVER_LICENSE = TagType.SENSITIVE.value
    IT_VAT_CODE = TagType.SENSITIVE.value
    IT_PASSPORT = TagType.SENSITIVE.value
    IT_IDENTITY_CARD = TagType.SENSITIVE.value

    # Australia
    AU_ABN = TagType.SENSITIVE.value
    AU_ACN = TagType.SENSITIVE.value
    AU_TFN = TagType.SENSITIVE.value
    AU_MEDICARE = TagType.SENSITIVE.value
||||
|
||||
|
||||
class StringAnalysis(BaseModel):
    """
    Used to store results from the sample data scans for each NER Entity
    """

    # Best (maximum) confidence score observed for this entity so far
    score: float
    # Number of sample rows in which this entity was recognized
    appearances: int
||||
|
||||
|
||||
# pylint: disable=import-outside-toplevel
|
||||
@ -68,46 +108,61 @@ class NERScanner:
|
||||
|
||||
@staticmethod
|
||||
def get_highest_score_label(
|
||||
labels_score, str_sample_data_rows: List[str]
|
||||
) -> Tuple[Optional[str], Optional[float]]:
|
||||
most_used_label_occurrence = 0
|
||||
label_score = None
|
||||
for label, score in labels_score.items():
|
||||
if score[0] == 1.0 and score[1] > len(str_sample_data_rows) * 0.8:
|
||||
return (label, score[0])
|
||||
if score[1] > most_used_label_occurrence:
|
||||
label_score = (label, score[0])
|
||||
most_used_label_occurrence = score[1]
|
||||
return label_score or (None, None)
|
||||
entities_score: Dict[str, StringAnalysis]
|
||||
) -> Tuple[str, float]:
|
||||
top_entity = max(
|
||||
entities_score,
|
||||
key=lambda type_: entities_score[type_].score
|
||||
* entities_score[type_].appearances
|
||||
* 0.8,
|
||||
)
|
||||
return top_entity, entities_score[top_entity].score
|
||||
|
||||
def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
    """
    Scan the column's sample data rows and look for PII.

    How this works:
    1. We create a list of strings [s1, s2, ..., sn] with each sample data row for a column
    2. Then, for each s_i:
       a. Run the analyzer, which will return a list of possible recognized Entities and confidence score
          For example, the result of analyzing `123456789` gives us
          [
              type: DATE_TIME, start: 0, end: 9, score: 0.85,
              type: US_BANK_NUMBER, start: 0, end: 9, score: 0.05,
              type: US_PASSPORT, start: 0, end: 9, score: 0.05,
              type: US_DRIVER_LICENSE, start: 0, end: 9, score: 0.01
          ]
       b. Each time an `Entity` appears (e.g., DATE_TIME), we store its max score and the number of appearances
    3. After gathering all the results for each row, get the `Entity` with maximum overall score
       and number of appearances.
    4. Once we have the "top" `Entity` from that column, we assign the PII label accordingly from `NEREntity`.

    :param sample_data_rows: sample values of the column under scan
    :return: the PII tag with its confidence, or None when nothing is recognized
    """
    logger.debug("Processing '%s'", sample_data_rows)

    # Per entity type, track the best score seen and how often it appeared.
    entities_score: Dict[str, StringAnalysis] = defaultdict(
        lambda: StringAnalysis(score=0, appearances=0)
    )

    # None values carry no signal; stringify everything else for the analyzer.
    str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
    for row in str_sample_data_rows:
        try:
            results = self.analyzer.analyze(row, language="en")
            for result in results:
                logger.debug("Found %s", result.entity_type)
                current = entities_score[result.entity_type]
                entities_score[result.entity_type] = StringAnalysis(
                    # Keep the best confidence seen for this entity so far
                    score=max(result.score, current.score),
                    appearances=current.appearances + 1,
                )
        except Exception as exc:
            # Best-effort: one row the analyzer cannot handle must not
            # abort the scan of the remaining rows.
            logger.warning(f"Unknown error while processing {row} - {exc}")
            logger.debug(traceback.format_exc())

    if entities_score:
        label, score = self.get_highest_score_label(entities_score)
        # Entities not listed in NEREntity default to non-sensitive
        tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
        return TagAndConfidence(tag=tag_type, confidence=score)

    # No entity was recognized in any sample row
    return None
|
||||
|
||||
|
||||
@ -69,6 +69,7 @@ class PIIProcessor:
|
||||
"""
|
||||
for idx, column in enumerate(table_entity.columns):
|
||||
|
||||
try:
|
||||
# First, check if the column we are about to process
|
||||
# already has PII tags or not
|
||||
column_has_pii_tag = any(
|
||||
@ -97,3 +98,5 @@ class PIIProcessor:
|
||||
table_entity=table_entity,
|
||||
column_name=table_entity.columns[idx].name.__root__,
|
||||
)
|
||||
except Exception as err:
|
||||
logger.warning(f"Error computing PII tags for [{column}] - [{err}]")
|
||||
|
||||
@ -51,3 +51,15 @@ class NERScannerTest(TestCase):
|
||||
).tag,
|
||||
TagType.SENSITIVE,
|
||||
)
|
||||
|
||||
def test_scanner_nonsensitive(self):
    """Plain location strings must be tagged as non-sensitive PII."""
    sample_rows = [
        "Washington",
        "Alaska",
        "Netherfield Lea Street",
    ]
    result = self.ner_scanner.scan(sample_rows)
    self.assertEqual(result.tag, TagType.NONSENSITIVE)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user