Restructure NER Scanner internals (#11690)

* Simplify col name scanner

* Restructure NER Scanner internals
This commit is contained in:
Pere Miquel Brull 2023-05-19 18:21:01 +02:00 committed by GitHub
parent fafbfdaeab
commit 0eb2201f94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 140 additions and 100 deletions

View File

@ -12,72 +12,42 @@
Regex scanner for column names Regex scanner for column names
""" """
import re import re
from enum import Enum, auto
from typing import Optional from typing import Optional
from metadata.pii.models import TagAndConfidence, TagType from metadata.pii.models import TagAndConfidence, TagType
class PiiTypes(Enum):
"""
PiiTypes enumerates the different types of PII data
"""
NONE = auto()
UNSUPPORTED = auto()
PHONE = auto()
EMAIL = auto()
CREDIT_CARD = auto()
ADDRESS = auto()
ADDRESS_LOCATION = auto()
PERSON = auto()
LOCATION = auto()
BIRTH_DATE = auto()
GENDER = auto()
NATIONALITY = auto()
IP_ADDRESS = auto()
SSN = auto()
USER_NAME = auto()
PASSWORD = auto()
ETHNICITY = auto()
TAX_ID = auto()
KEY = auto()
BANKACC = auto()
class ColumnNameScanner: class ColumnNameScanner:
""" """
Column Name Scanner to scan column name Column Name Scanner to scan column name
""" """
sensitive_regex = { sensitive_regex = {
PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE), "PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE), "SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
PiiTypes.CREDIT_CARD: re.compile("^.*(credit).*(card).*$", re.IGNORECASE), "CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
PiiTypes.BANKACC: re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE), "BANKACC": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE), "EMAIL": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
PiiTypes.USER_NAME: re.compile( "USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
"^.*(user|client|person).*(name).*$", re.IGNORECASE "PERSON": re.compile(
),
PiiTypes.PERSON: re.compile(
"^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$", "^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
re.IGNORECASE, re.IGNORECASE,
), ),
} }
non_sensitive_regex = { non_sensitive_regex = {
PiiTypes.BIRTH_DATE: re.compile( "BIRTH_DATE": re.compile(
"^.*(date_of_birth|dateofbirth|dob|" "^.*(date_of_birth|dateofbirth|dob|"
"birthday|date_of_death|dateofdeath).*$", "birthday|date_of_death|dateofdeath).*$",
re.IGNORECASE, re.IGNORECASE,
), ),
PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE), "GENDER": re.compile("^.*(gender).*$", re.IGNORECASE),
PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE), "NATIONALITY": re.compile("^.*(nationality).*$", re.IGNORECASE),
PiiTypes.ADDRESS: re.compile( "ADDRESS": re.compile(
"^.*(address|city|state|county|country|" "^.*(address|city|state|county|country|"
"zipcode|zip|postal|zone|borough).*$", "zipcode|zip|postal|zone|borough).*$",
re.IGNORECASE, re.IGNORECASE,
), ),
PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE), "PHONE": re.compile("^.*(phone).*$", re.IGNORECASE),
} }
@classmethod @classmethod

View File

@ -14,8 +14,11 @@ NER Scanner based on Presidio.
Supported Entities https://microsoft.github.io/presidio/supported_entities/ Supported Entities https://microsoft.github.io/presidio/supported_entities/
""" """
import traceback import traceback
from collections import defaultdict
from enum import Enum from enum import Enum
from typing import Any, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel
from metadata.pii import SPACY_EN_MODEL from metadata.pii import SPACY_EN_MODEL
from metadata.pii.models import TagAndConfidence, TagType from metadata.pii.models import TagAndConfidence, TagType
@ -25,21 +28,58 @@ logger = pii_logger()
class NEREntity(Enum): class NEREntity(Enum):
"""
PII Entities supported by Presidio https://microsoft.github.io/presidio/supported_entities/
"""
# Global
CREDIT_CARD = TagType.SENSITIVE.value CREDIT_CARD = TagType.SENSITIVE.value
CRYPTO = TagType.SENSITIVE.value
DATE_TIME = TagType.NONSENSITIVE.value
EMAIL_ADDRESS = TagType.SENSITIVE.value EMAIL_ADDRESS = TagType.SENSITIVE.value
IBAN_CODE = TagType.SENSITIVE.value IBAN_CODE = TagType.SENSITIVE.value
IP_ADDRESS = TagType.SENSITIVE.value IP_ADDRESS = TagType.SENSITIVE.value
NRP = TagType.NONSENSITIVE.value NRP = TagType.NONSENSITIVE.value
LOCATION = TagType.NONSENSITIVE.value LOCATION = TagType.NONSENSITIVE.value
PERSON = TagType.SENSITIVE.value
PHONE_NUMBER = TagType.NONSENSITIVE.value PHONE_NUMBER = TagType.NONSENSITIVE.value
MEDICAL_LICENSE = TagType.SENSITIVE.value MEDICAL_LICENSE = TagType.SENSITIVE.value
US_DRIVER_LICENSE = TagType.SENSITIVE.value
DATE_TIME = TagType.NONSENSITIVE.value
URL = TagType.NONSENSITIVE.value URL = TagType.NONSENSITIVE.value
# USA
US_BANK_NUMBER = TagType.SENSITIVE.value US_BANK_NUMBER = TagType.SENSITIVE.value
US_SSN = TagType.SENSITIVE.value US_DRIVER_LICENSE = TagType.SENSITIVE.value
PERSON = TagType.SENSITIVE.value US_ITIN = TagType.SENSITIVE.value
US_PASSPORT = TagType.SENSITIVE.value US_PASSPORT = TagType.SENSITIVE.value
US_SSN = TagType.SENSITIVE.value
# UK
UK_NHS = TagType.SENSITIVE.value
# Spain
NIF = TagType.SENSITIVE.value
# Italy
IT_FISCAL_CODE = TagType.SENSITIVE.value
IT_DRIVER_LICENSE = TagType.SENSITIVE.value
IT_VAT_CODE = TagType.SENSITIVE.value
IT_PASSPORT = TagType.SENSITIVE.value
IT_IDENTITY_CARD = TagType.SENSITIVE.value
# Australia
AU_ABN = TagType.SENSITIVE.value
AU_ACN = TagType.SENSITIVE.value
AU_TFN = TagType.SENSITIVE.value
AU_MEDICARE = TagType.SENSITIVE.value
class StringAnalysis(BaseModel):
"""
Used to store results from the sample data scans for each NER Entity
"""
score: float
appearances: int
# pylint: disable=import-outside-toplevel # pylint: disable=import-outside-toplevel
@ -68,46 +108,61 @@ class NERScanner:
@staticmethod @staticmethod
def get_highest_score_label( def get_highest_score_label(
labels_score, str_sample_data_rows: List[str] entities_score: Dict[str, StringAnalysis]
) -> Tuple[Optional[str], Optional[float]]: ) -> Tuple[str, float]:
most_used_label_occurrence = 0 top_entity = max(
label_score = None entities_score,
for label, score in labels_score.items(): key=lambda type_: entities_score[type_].score
if score[0] == 1.0 and score[1] > len(str_sample_data_rows) * 0.8: * entities_score[type_].appearances
return (label, score[0]) * 0.8,
if score[1] > most_used_label_occurrence: )
label_score = (label, score[0]) return top_entity, entities_score[top_entity].score
most_used_label_occurrence = score[1]
return label_score or (None, None)
def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]: def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
""" """
Scan the column's sample data rows and look for PII Scan the column's sample data rows and look for PII.
How this works:
1. We create a list of strings [s1, s2, ..., sn] with each sample data row for a column
2. Then, for each s_i:
a. Run the analyzer, which will return a list of the recognized Entities, each with its confidence score
For example, the result of analyzing `123456789` gives us
[
type: DATE_TIME, start: 0, end: 9, score: 0.85,
type: US_BANK_NUMBER, start: 0, end: 9, score: 0.05,
type: US_PASSPORT, start: 0, end: 9, score: 0.05,
type: US_DRIVER_LICENSE, start: 0, end: 9, score: 0.01
]
b. Each time an `Entity` appears (e.g., DATE_TIME), we store its max score and the number of appearances
3. After gathering all the results for each row, get the `Entity` with maximum overall score
and number of appearances. This gets computed as "score * appearances * 0.8", which can
be thought of as the "score" times "weighted down appearances".
4. Once we have the "top" `Entity` from that column, we assign the PII label accordingly from `NEREntity`.
""" """
logger.debug("Processing '%s'", sample_data_rows) logger.debug("Processing '%s'", sample_data_rows)
labels_score = {}
# Initialize an empty dict for the given row list
entities_score: Dict[str, StringAnalysis] = defaultdict(
lambda: StringAnalysis(score=0, appearances=0)
)
str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None] str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
for row in str_sample_data_rows: for row in str_sample_data_rows:
try: try:
results = self.analyzer.analyze(row, language="en") results = self.analyzer.analyze(row, language="en")
for result in results: for result in results:
logger.debug("Found %s", result.entity_type) entities_score[result.entity_type] = StringAnalysis(
tag = result.entity_type score=result.score
if tag in labels_score: if result.score > entities_score[result.entity_type].score
labels_score[tag] = ( else entities_score[result.entity_type].score,
result.score appearances=entities_score[result.entity_type].appearances + 1,
if result.score > labels_score[tag][0] )
else labels_score[tag][0],
labels_score[tag][1] + 1,
)
else:
labels_score[tag] = (result.score, 1)
except Exception as exc: except Exception as exc:
logger.warning(f"Unknown error while processing {row} - {exc}") logger.warning(f"Unknown error while processing {row} - {exc}")
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
label, score = self.get_highest_score_label(labels_score, str_sample_data_rows) if entities_score:
if label and score: label, score = self.get_highest_score_label(entities_score)
tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
return TagAndConfidence(tag=tag_type, confidence=score) return TagAndConfidence(tag=tag_type, confidence=score)

View File

@ -69,31 +69,34 @@ class PIIProcessor:
""" """
for idx, column in enumerate(table_entity.columns): for idx, column in enumerate(table_entity.columns):
# First, check if the column we are about to process try:
# already has PII tags or not # First, check if the column we are about to process
column_has_pii_tag = any( # already has PII tags or not
(PII in tag.tagFQN.__root__ for tag in column.tags or []) column_has_pii_tag = any(
) (PII in tag.tagFQN.__root__ for tag in column.tags or [])
# If it has PII tags, we skip the processing
# for the column
if column_has_pii_tag is True:
continue
# Scan by column name. If no results there, check the sample data, if any
tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
self.ner_scanner.scan([row[idx] for row in table_data.rows])
if table_data
else None
)
if (
tag_and_confidence
and tag_and_confidence.tag
and tag_and_confidence.confidence >= confidence_threshold / 100
):
self.patch_column_tag(
tag_type=tag_and_confidence.tag.value,
table_entity=table_entity,
column_name=table_entity.columns[idx].name.__root__,
) )
# If it has PII tags, we skip the processing
# for the column
if column_has_pii_tag is True:
continue
# Scan by column name. If no results there, check the sample data, if any
tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
self.ner_scanner.scan([row[idx] for row in table_data.rows])
if table_data
else None
)
if (
tag_and_confidence
and tag_and_confidence.tag
and tag_and_confidence.confidence >= confidence_threshold / 100
):
self.patch_column_tag(
tag_type=tag_and_confidence.tag.value,
table_entity=table_entity,
column_name=table_entity.columns[idx].name.__root__,
)
except Exception as err:
logger.warning(f"Error computing PII tags for [{column}] - [{err}]")

View File

@ -51,3 +51,15 @@ class NERScannerTest(TestCase):
).tag, ).tag,
TagType.SENSITIVE, TagType.SENSITIVE,
) )
def test_scanner_nonsensitive(self):
self.assertEqual(
self.ner_scanner.scan(
[
"Washington",
"Alaska",
"Netherfield Lea Street",
]
).tag,
TagType.NONSENSITIVE,
)