Restructure NER Scanner internals (#11690)

* Simplify col name scanner

* Restructure NER Scanner internals
Pere Miquel Brull 2023-05-19 18:21:01 +02:00 committed by GitHub
parent fafbfdaeab
commit 0eb2201f94
4 changed files with 140 additions and 100 deletions

View File

@@ -12,72 +12,42 @@
Regex scanner for column names
"""
import re
from enum import Enum, auto
from typing import Optional
from metadata.pii.models import TagAndConfidence, TagType
class PiiTypes(Enum):
"""
PiiTypes enumerates the different types of PII data
"""
NONE = auto()
UNSUPPORTED = auto()
PHONE = auto()
EMAIL = auto()
CREDIT_CARD = auto()
ADDRESS = auto()
ADDRESS_LOCATION = auto()
PERSON = auto()
LOCATION = auto()
BIRTH_DATE = auto()
GENDER = auto()
NATIONALITY = auto()
IP_ADDRESS = auto()
SSN = auto()
USER_NAME = auto()
PASSWORD = auto()
ETHNICITY = auto()
TAX_ID = auto()
KEY = auto()
BANKACC = auto()
class ColumnNameScanner:
"""
Column Name Scanner to scan column name
"""
sensitive_regex = {
PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
PiiTypes.CREDIT_CARD: re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
PiiTypes.BANKACC: re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
PiiTypes.USER_NAME: re.compile(
"^.*(user|client|person).*(name).*$", re.IGNORECASE
),
PiiTypes.PERSON: re.compile(
"PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
"SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
"CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
"BANKACC": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
"EMAIL": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
"USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
"PERSON": re.compile(
"^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
re.IGNORECASE,
),
}
non_sensitive_regex = {
PiiTypes.BIRTH_DATE: re.compile(
"BIRTH_DATE": re.compile(
"^.*(date_of_birth|dateofbirth|dob|"
"birthday|date_of_death|dateofdeath).*$",
re.IGNORECASE,
),
PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
PiiTypes.ADDRESS: re.compile(
"GENDER": re.compile("^.*(gender).*$", re.IGNORECASE),
"NATIONALITY": re.compile("^.*(nationality).*$", re.IGNORECASE),
"ADDRESS": re.compile(
"^.*(address|city|state|county|country|"
"zipcode|zip|postal|zone|borough).*$",
re.IGNORECASE,
),
PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
"PHONE": re.compile("^.*(phone).*$", re.IGNORECASE),
}
@classmethod
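
The @classmethod cut off by this hunk is what applies the patterns above to a column name. As a minimal, hedged sketch of how such string-keyed regex dicts are typically matched (first_matching_label and the local SENSITIVE dict are illustrative, not the scanner's actual API):

import re
from typing import Dict, Optional, Pattern

# Two patterns copied from the diff above so the sketch runs on its own.
SENSITIVE: Dict[str, Pattern] = {
    "PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
    "SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
}

def first_matching_label(column_name: str) -> Optional[str]:
    # Return the first label whose regex matches the column name, else None.
    for label, pattern in SENSITIVE.items():
        if pattern.match(column_name):
            return label
    return None

assert first_matching_label("user_password_hash") == "PASSWORD"
assert first_matching_label("total_amount") is None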

View File

@@ -14,8 +14,11 @@ NER Scanner based on Presidio.
Supported Entities https://microsoft.github.io/presidio/supported_entities/
"""
import traceback
from collections import defaultdict
from enum import Enum
from typing import Any, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel
from metadata.pii import SPACY_EN_MODEL
from metadata.pii.models import TagAndConfidence, TagType
@@ -25,21 +28,58 @@ logger = pii_logger()
class NEREntity(Enum):
"""
PII Entities supported by Presidio https://microsoft.github.io/presidio/supported_entities/
"""
# Global
CREDIT_CARD = TagType.SENSITIVE.value
CRYPTO = TagType.SENSITIVE.value
DATE_TIME = TagType.NONSENSITIVE.value
EMAIL_ADDRESS = TagType.SENSITIVE.value
IBAN_CODE = TagType.SENSITIVE.value
IP_ADDRESS = TagType.SENSITIVE.value
NRP = TagType.NONSENSITIVE.value
LOCATION = TagType.NONSENSITIVE.value
PERSON = TagType.SENSITIVE.value
PHONE_NUMBER = TagType.NONSENSITIVE.value
MEDICAL_LICENSE = TagType.SENSITIVE.value
US_DRIVER_LICENSE = TagType.SENSITIVE.value
DATE_TIME = TagType.NONSENSITIVE.value
URL = TagType.NONSENSITIVE.value
# USA
US_BANK_NUMBER = TagType.SENSITIVE.value
US_SSN = TagType.SENSITIVE.value
PERSON = TagType.SENSITIVE.value
US_DRIVER_LICENSE = TagType.SENSITIVE.value
US_ITIN = TagType.SENSITIVE.value
US_PASSPORT = TagType.SENSITIVE.value
US_SSN = TagType.SENSITIVE.value
# UK
UK_NHS = TagType.SENSITIVE.value
# Spain
NIF = TagType.SENSITIVE.value
# Italy
IT_FISCAL_CODE = TagType.SENSITIVE.value
IT_DRIVER_LICENSE = TagType.SENSITIVE.value
IT_VAT_CODE = TagType.SENSITIVE.value
IT_PASSPORT = TagType.SENSITIVE.value
IT_IDENTITY_CARD = TagType.SENSITIVE.value
# Australia
AU_ABN = TagType.SENSITIVE.value
AU_ACN = TagType.SENSITIVE.value
AU_TFN = TagType.SENSITIVE.value
AU_MEDICARE = TagType.SENSITIVE.value
class StringAnalysis(BaseModel):
"""
Used to store results from the sample data scans for each NER Entity
"""
score: float
appearances: int
# pylint: disable=import-outside-toplevel
@@ -68,46 +108,61 @@ class NERScanner:
@staticmethod
def get_highest_score_label(
labels_score, str_sample_data_rows: List[str]
) -> Tuple[Optional[str], Optional[float]]:
most_used_label_occurrence = 0
label_score = None
for label, score in labels_score.items():
if score[0] == 1.0 and score[1] > len(str_sample_data_rows) * 0.8:
return (label, score[0])
if score[1] > most_used_label_occurrence:
label_score = (label, score[0])
most_used_label_occurrence = score[1]
return label_score or (None, None)
entities_score: Dict[str, StringAnalysis]
) -> Tuple[str, float]:
top_entity = max(
entities_score,
key=lambda type_: entities_score[type_].score
* entities_score[type_].appearances
* 0.8,
)
return top_entity, entities_score[top_entity].score
def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
"""
Scan the column's sample data rows and look for PII
Scan the column's sample data rows and look for PII.
How this works:
1. We create a list of strings [s1, s2, ..., sn] with each sample data row for a column
2. Then, for each s_i:
a. Run the analyzer, which will return a list of possible recognized Entities and confidence score
For example, the result of analyzing `123456789` gives us
[
type: DATE_TIME, start: 0, end: 9, score: 0.85,
type: US_BANK_NUMBER, start: 0, end: 9, score: 0.05,
type: US_PASSPORT, start: 0, end: 9, score: 0.05,
type: US_DRIVER_LICENSE, start: 0, end: 9, score: 0.01
]
b. Each time an `Entity` appears (e.g., DATE_TIME), we store its max score and the number of appearances
3. After gathering all the results for each row, get the `Entity` with maximum overall score
and number of appearances. This gets computed as "score * appearances * 0.8", which can
be thought of as the "score" times the "weighted-down appearances".
4. Once we have the "top" `Entity` from that column, we assign the PII label accordingly from `NEREntity`.
"""
logger.debug("Processing '%s'", sample_data_rows)
labels_score = {}
# Initialize an empty dict for the given row list
entities_score: Dict[str, StringAnalysis] = defaultdict(
lambda: StringAnalysis(score=0, appearances=0)
)
str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
for row in str_sample_data_rows:
try:
results = self.analyzer.analyze(row, language="en")
for result in results:
logger.debug("Found %s", result.entity_type)
tag = result.entity_type
if tag in labels_score:
labels_score[tag] = (
result.score
if result.score > labels_score[tag][0]
else labels_score[tag][0],
labels_score[tag][1] + 1,
entities_score[result.entity_type] = StringAnalysis(
score=result.score
if result.score > entities_score[result.entity_type].score
else entities_score[result.entity_type].score,
appearances=entities_score[result.entity_type].appearances + 1,
)
else:
labels_score[tag] = (result.score, 1)
except Exception as exc:
logger.warning(f"Unknown error while processing {row} - {exc}")
logger.debug(traceback.format_exc())
label, score = self.get_highest_score_label(labels_score, str_sample_data_rows)
if label and score:
if entities_score:
label, score = self.get_highest_score_label(entities_score)
tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
return TagAndConfidence(tag=tag_type, confidence=score)
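
As a hedged illustration of step 3 in the docstring above, the ranking used by get_highest_score_label can be replayed on a toy entities_score dict; StringAnalysis mirrors the model added in this diff, while the sample scores are invented for the example:

from pydantic import BaseModel

class StringAnalysis(BaseModel):
    score: float
    appearances: int

# Hypothetical aggregate after scanning a few sample rows.
entities_score = {
    "DATE_TIME": StringAnalysis(score=0.85, appearances=1),
    "US_BANK_NUMBER": StringAnalysis(score=0.4, appearances=3),
}

# Same key as the new get_highest_score_label: the constant 0.8 factor
# scales every candidate equally, so it never changes which entity wins.
top_entity = max(
    entities_score,
    key=lambda type_: entities_score[type_].score
    * entities_score[type_].appearances
    * 0.8,
)
print(top_entity, entities_score[top_entity].score)  # US_BANK_NUMBER 0.4 (0.4 * 3 > 0.85 * 1)

The winning entity name is then looked up in NEREntity.__members__ (step 4) to pick the tag, falling back to NonSensitive for entities not listed in the enum.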

View File

@@ -69,6 +69,7 @@ class PIIProcessor:
"""
for idx, column in enumerate(table_entity.columns):
try:
# First, check if the column we are about to process
# already has PII tags or not
column_has_pii_tag = any(
@@ -97,3 +98,5 @@ class PIIProcessor:
table_entity=table_entity,
column_name=table_entity.columns[idx].name.__root__,
)
except Exception as err:
logger.warning(f"Error computing PII tags for [{column}] - [{err}]")
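
The two fragments above only show where the per-column loop opens and where its new except clause lands. A rough, self-contained sketch of the shape this gives the processor (process_columns and compute_tags are hypothetical names, not the PIIProcessor API):

import logging

logger = logging.getLogger(__name__)

def process_columns(columns, compute_tags):
    # Apply compute_tags to each column; log failures instead of letting
    # one bad column abort tagging for the whole table.
    for idx, column in enumerate(columns):
        try:
            if getattr(column, "has_pii_tag", False):
                continue  # column already carries a PII tag, nothing to do
            compute_tags(idx, column)
        except Exception as err:
            logger.warning(f"Error computing PII tags for [{column}] - [{err}]")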

View File

@@ -51,3 +51,15 @@ class NERScannerTest(TestCase):
).tag,
TagType.SENSITIVE,
)
def test_scanner_nonsensitive(self):
self.assertEqual(
self.ner_scanner.scan(
[
"Washington",
"Alaska",
"Netherfield Lea Street",
]
).tag,
TagType.NONSENSITIVE,
)
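
For reference, a hedged sketch of exercising the scanner the same way the tests do; the import path and the no-argument NERScanner() constructor are assumptions, since the test's setUp is not part of this hunk:

# Module path and constructor below are assumptions, not shown in this diff.
from metadata.pii.scanners.ner_scanner import NERScanner

scanner = NERScanner()
result = scanner.scan(["john.doe@example.com", "jane@example.org"])
if result:
    print(result.tag, result.confidence)
    # Email-like samples are expected to resolve to the Sensitive tag
    # via the EMAIL_ADDRESS entry in NEREntity.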