mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-12-03 19:16:10 +00:00
Restructure NER Scanner internals (#11690)
* Simplify the column-name scanner
* Restructure the NER Scanner internals
This commit is contained in:
parent
fafbfdaeab
commit
0eb2201f94
@ -12,72 +12,42 @@
|
||||
Regex scanner for column names
|
||||
"""
|
||||
import re
|
||||
from enum import Enum, auto
|
||||
from typing import Optional
|
||||
|
||||
from metadata.pii.models import TagAndConfidence, TagType
|
||||
|
||||
|
||||
class PiiTypes(Enum):
    """
    PiiTypes enumerates the different types of PII data
    a scanner can recognise. Member order is significant:
    ``auto()`` assigns values 1..N in declaration order.
    """

    # Sentinel / fallback kinds
    NONE = auto()
    UNSUPPORTED = auto()
    # Contact information
    PHONE = auto()
    EMAIL = auto()
    # Financial
    CREDIT_CARD = auto()
    # Location-related
    ADDRESS = auto()
    ADDRESS_LOCATION = auto()
    # Identity
    PERSON = auto()
    LOCATION = auto()
    BIRTH_DATE = auto()
    GENDER = auto()
    NATIONALITY = auto()
    # Technical identifiers
    IP_ADDRESS = auto()
    SSN = auto()
    USER_NAME = auto()
    PASSWORD = auto()
    # Demographic / fiscal
    ETHNICITY = auto()
    TAX_ID = auto()
    KEY = auto()
    BANKACC = auto()
||||
|
||||
|
||||
class ColumnNameScanner:
    """
    Column Name Scanner to scan column name
    """

    # Column-name patterns that flag a column as sensitive PII.
    # Keys are the PII label names; values are case-insensitive
    # full-line regexes matched against the column name.
    sensitive_regex = {
        label: re.compile(pattern, re.IGNORECASE)
        for label, pattern in (
            ("PASSWORD", "^.*password.*$"),
            ("SSN", "^.*(ssn|social).*$"),
            ("CREDIT_CARD", "^.*(credit).*(card).*$"),
            ("BANKACC", "^.*bank.*(acc|num).*$"),
            ("EMAIL", "^.*(email|e-mail|mail).*$"),
            ("USER_NAME", "^.*(user|client|person).*(name).*$"),
            (
                "PERSON",
                "^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
            ),
        )
    }
    # Column-name patterns that flag a column as non-sensitive PII.
    non_sensitive_regex = {
        label: re.compile(pattern, re.IGNORECASE)
        for label, pattern in (
            (
                "BIRTH_DATE",
                "^.*(date_of_birth|dateofbirth|dob|"
                "birthday|date_of_death|dateofdeath).*$",
            ),
            ("GENDER", "^.*(gender).*$"),
            ("NATIONALITY", "^.*(nationality).*$"),
            (
                "ADDRESS",
                "^.*(address|city|state|county|country|"
                "zipcode|zip|postal|zone|borough).*$",
            ),
            ("PHONE", "^.*(phone).*$"),
        )
    }
|
||||
|
||||
@classmethod
|
||||
|
||||
@ -14,8 +14,11 @@ NER Scanner based on Presidio.
|
||||
Supported Entities https://microsoft.github.io/presidio/supported_entities/
|
||||
"""
|
||||
import traceback
|
||||
from collections import defaultdict
|
||||
from enum import Enum
|
||||
from typing import Any, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from metadata.pii import SPACY_EN_MODEL
|
||||
from metadata.pii.models import TagAndConfidence, TagType
|
||||
@ -25,21 +28,58 @@ logger = pii_logger()
|
||||
|
||||
|
||||
class NEREntity(Enum):
    """
    PII Entities supported by Presidio https://microsoft.github.io/presidio/supported_entities/

    Each member maps a Presidio entity type to the OpenMetadata tag value
    (sensitive / non-sensitive). Many members share the same value, so the
    Enum machinery turns later ones into aliases of the first member with
    that value; lookups therefore go through ``NEREntity.__members__``,
    which includes aliases, never through value lookup.
    """

    # Global
    CREDIT_CARD = TagType.SENSITIVE.value
    CRYPTO = TagType.SENSITIVE.value
    DATE_TIME = TagType.NONSENSITIVE.value
    EMAIL_ADDRESS = TagType.SENSITIVE.value
    IBAN_CODE = TagType.SENSITIVE.value
    IP_ADDRESS = TagType.SENSITIVE.value
    NRP = TagType.NONSENSITIVE.value
    LOCATION = TagType.NONSENSITIVE.value
    PERSON = TagType.SENSITIVE.value
    PHONE_NUMBER = TagType.NONSENSITIVE.value
    MEDICAL_LICENSE = TagType.SENSITIVE.value
    URL = TagType.NONSENSITIVE.value

    # USA
    US_BANK_NUMBER = TagType.SENSITIVE.value
    US_DRIVER_LICENSE = TagType.SENSITIVE.value
    US_ITIN = TagType.SENSITIVE.value
    US_PASSPORT = TagType.SENSITIVE.value
    US_SSN = TagType.SENSITIVE.value

    # UK
    UK_NHS = TagType.SENSITIVE.value

    # Spain
    NIF = TagType.SENSITIVE.value

    # Italy
    IT_FISCAL_CODE = TagType.SENSITIVE.value
    IT_DRIVER_LICENSE = TagType.SENSITIVE.value
    IT_VAT_CODE = TagType.SENSITIVE.value
    IT_PASSPORT = TagType.SENSITIVE.value
    IT_IDENTITY_CARD = TagType.SENSITIVE.value

    # Australia
    AU_ABN = TagType.SENSITIVE.value
    AU_ACN = TagType.SENSITIVE.value
    AU_TFN = TagType.SENSITIVE.value
    AU_MEDICARE = TagType.SENSITIVE.value
||||
|
||||
|
||||
class StringAnalysis(BaseModel):
    """
    Used to store results from the sample data scans for each NER Entity
    """

    # Best (maximum) confidence score observed for this entity so far
    score: float
    # Number of sample rows in which this entity was recognized
    appearances: int
||||
|
||||
|
||||
# pylint: disable=import-outside-toplevel
|
||||
@ -68,46 +108,61 @@ class NERScanner:
|
||||
|
||||
@staticmethod
|
||||
def get_highest_score_label(
|
||||
labels_score, str_sample_data_rows: List[str]
|
||||
) -> Tuple[Optional[str], Optional[float]]:
|
||||
most_used_label_occurrence = 0
|
||||
label_score = None
|
||||
for label, score in labels_score.items():
|
||||
if score[0] == 1.0 and score[1] > len(str_sample_data_rows) * 0.8:
|
||||
return (label, score[0])
|
||||
if score[1] > most_used_label_occurrence:
|
||||
label_score = (label, score[0])
|
||||
most_used_label_occurrence = score[1]
|
||||
return label_score or (None, None)
|
||||
entities_score: Dict[str, StringAnalysis]
|
||||
) -> Tuple[str, float]:
|
||||
top_entity = max(
|
||||
entities_score,
|
||||
key=lambda type_: entities_score[type_].score
|
||||
* entities_score[type_].appearances
|
||||
* 0.8,
|
||||
)
|
||||
return top_entity, entities_score[top_entity].score
|
||||
|
||||
def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
    """
    Scan the column's sample data rows and look for PII.

    How this works:
    1. We create a list of strings [s1, s2, ..., sn] with each sample data row for a column
    2. Then, for each s_i:
       a. Run the analyzer, which will return a list of possible recognized Entities and confidence score
          For example, the result of analyzing `123456789` gives us
          [
              type: DATE_TIME, start: 0, end: 9, score: 0.85,
              type: US_BANK_NUMBER, start: 0, end: 9, score: 0.05,
              type: US_PASSPORT, start: 0, end: 9, score: 0.05,
              type: US_DRIVER_LICENSE, start: 0, end: 9, score: 0.01
          ]
       b. Each time an `Entity` appears (e.g., DATE_TIME), we store its max score and the number of appearances
    3. After gathering all the results for each row, get the `Entity` with maximum overall score
       and number of appearances.
    4. Once we have the "top" `Entity` from that column, we assign the PII label accordingly from `NEREntity`.

    :param sample_data_rows: sample values of the column under scan
    :return: the PII tag with its confidence, or None when nothing is recognized
    """
    logger.debug("Processing '%s'", sample_data_rows)

    # Per entity type, track the best score seen and how often it appeared.
    entities_score: Dict[str, StringAnalysis] = defaultdict(
        lambda: StringAnalysis(score=0, appearances=0)
    )

    # None values carry no signal; stringify everything else for the analyzer.
    str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
    for row in str_sample_data_rows:
        try:
            results = self.analyzer.analyze(row, language="en")
            for result in results:
                logger.debug("Found %s", result.entity_type)
                current = entities_score[result.entity_type]
                entities_score[result.entity_type] = StringAnalysis(
                    # Keep the best confidence seen for this entity so far
                    score=max(result.score, current.score),
                    appearances=current.appearances + 1,
                )
        except Exception as exc:
            # Best-effort: one row the analyzer cannot handle must not
            # abort the scan of the remaining rows.
            logger.warning(f"Unknown error while processing {row} - {exc}")
            logger.debug(traceback.format_exc())

    if entities_score:
        label, score = self.get_highest_score_label(entities_score)
        # Entities not listed in NEREntity default to non-sensitive
        tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
        return TagAndConfidence(tag=tag_type, confidence=score)

    # No entity was recognized in any sample row
    return None
|
||||
|
||||
|
||||
@ -69,6 +69,7 @@ class PIIProcessor:
|
||||
"""
|
||||
for idx, column in enumerate(table_entity.columns):
|
||||
|
||||
try:
|
||||
# First, check if the column we are about to process
|
||||
# already has PII tags or not
|
||||
column_has_pii_tag = any(
|
||||
@ -97,3 +98,5 @@ class PIIProcessor:
|
||||
table_entity=table_entity,
|
||||
column_name=table_entity.columns[idx].name.__root__,
|
||||
)
|
||||
except Exception as err:
|
||||
logger.warning(f"Error computing PII tags for [{column}] - [{err}]")
|
||||
|
||||
@ -51,3 +51,15 @@ class NERScannerTest(TestCase):
|
||||
).tag,
|
||||
TagType.SENSITIVE,
|
||||
)
|
||||
|
||||
def test_scanner_nonsensitive(self):
    """Plain location strings must be tagged as non-sensitive PII."""
    sample_rows = [
        "Washington",
        "Alaska",
        "Netherfield Lea Street",
    ]
    result = self.ner_scanner.scan(sample_rows)
    self.assertEqual(result.tag, TagType.NONSENSITIVE)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user