Restructure NER Scanner internals (#11690)
* Simplify col name scanner
* Restructure NER Scanner internals
parent fafbfdaeab
commit 0eb2201f94
@@ -12,72 +12,42 @@
 Regex scanner for column names
 """
 import re
-from enum import Enum, auto
 from typing import Optional
 
 from metadata.pii.models import TagAndConfidence, TagType
 
 
-class PiiTypes(Enum):
-    """
-    PiiTypes enumerates the different types of PII data
-    """
-
-    NONE = auto()
-    UNSUPPORTED = auto()
-    PHONE = auto()
-    EMAIL = auto()
-    CREDIT_CARD = auto()
-    ADDRESS = auto()
-    ADDRESS_LOCATION = auto()
-    PERSON = auto()
-    LOCATION = auto()
-    BIRTH_DATE = auto()
-    GENDER = auto()
-    NATIONALITY = auto()
-    IP_ADDRESS = auto()
-    SSN = auto()
-    USER_NAME = auto()
-    PASSWORD = auto()
-    ETHNICITY = auto()
-    TAX_ID = auto()
-    KEY = auto()
-    BANKACC = auto()
-
-
 class ColumnNameScanner:
     """
     Column Name Scanner to scan column name
     """
 
     sensitive_regex = {
-        PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
-        PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
-        PiiTypes.CREDIT_CARD: re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
-        PiiTypes.BANKACC: re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
-        PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
-        PiiTypes.USER_NAME: re.compile(
-            "^.*(user|client|person).*(name).*$", re.IGNORECASE
-        ),
-        PiiTypes.PERSON: re.compile(
+        "PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
+        "SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
+        "CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
+        "BANKACC": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
+        "EMAIL": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
+        "USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
+        "PERSON": re.compile(
             "^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
             re.IGNORECASE,
         ),
     }
     non_sensitive_regex = {
-        PiiTypes.BIRTH_DATE: re.compile(
+        "BIRTH_DATE": re.compile(
             "^.*(date_of_birth|dateofbirth|dob|"
             "birthday|date_of_death|dateofdeath).*$",
             re.IGNORECASE,
         ),
-        PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
-        PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
-        PiiTypes.ADDRESS: re.compile(
+        "GENDER": re.compile("^.*(gender).*$", re.IGNORECASE),
+        "NATIONALITY": re.compile("^.*(nationality).*$", re.IGNORECASE),
+        "ADDRESS": re.compile(
             "^.*(address|city|state|county|country|"
             "zipcode|zip|postal|zone|borough).*$",
             re.IGNORECASE,
         ),
-        PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
+        "PHONE": re.compile("^.*(phone).*$", re.IGNORECASE),
     }
 
     @classmethod
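Aside: the hunk above stops at the @classmethod line, so the actual scan method of ColumnNameScanner is not shown here. As a rough standalone sketch of how such string-keyed regex dicts are typically applied to a column name (the scan_column_name helper below is a hypothetical stand-in, not the class's real API):

import re
from typing import Optional

# Trimmed copy of the dict above; the real class would consult the
# full sensitive_regex / non_sensitive_regex dicts.
sensitive_regex = {
    "PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
    "SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
}


def scan_column_name(column_name: str) -> Optional[str]:
    # Return the first PII key whose pattern matches the column name.
    for pii_type, pattern in sensitive_regex.items():
        if pattern.match(column_name):
            return pii_type
    return None


print(scan_column_name("user_password_hash"))  # PASSWORD
print(scan_column_name("order_total"))  # None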
@@ -14,8 +14,11 @@ NER Scanner based on Presidio.
 Supported Entities https://microsoft.github.io/presidio/supported_entities/
 """
 import traceback
+from collections import defaultdict
 from enum import Enum
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
+from pydantic import BaseModel
+
 from metadata.pii import SPACY_EN_MODEL
 from metadata.pii.models import TagAndConfidence, TagType
@@ -25,21 +28,58 @@ logger = pii_logger()
 
 
 class NEREntity(Enum):
+    """
+    PII Entities supported by Presidio https://microsoft.github.io/presidio/supported_entities/
+    """
+
+    # Global
     CREDIT_CARD = TagType.SENSITIVE.value
+    CRYPTO = TagType.SENSITIVE.value
+    DATE_TIME = TagType.NONSENSITIVE.value
     EMAIL_ADDRESS = TagType.SENSITIVE.value
     IBAN_CODE = TagType.SENSITIVE.value
     IP_ADDRESS = TagType.SENSITIVE.value
     NRP = TagType.NONSENSITIVE.value
     LOCATION = TagType.NONSENSITIVE.value
+    PERSON = TagType.SENSITIVE.value
     PHONE_NUMBER = TagType.NONSENSITIVE.value
     MEDICAL_LICENSE = TagType.SENSITIVE.value
-    US_DRIVER_LICENSE = TagType.SENSITIVE.value
-    DATE_TIME = TagType.NONSENSITIVE.value
     URL = TagType.NONSENSITIVE.value
 
+    # USA
     US_BANK_NUMBER = TagType.SENSITIVE.value
-    US_SSN = TagType.SENSITIVE.value
-    PERSON = TagType.SENSITIVE.value
+    US_DRIVER_LICENSE = TagType.SENSITIVE.value
+    US_ITIN = TagType.SENSITIVE.value
     US_PASSPORT = TagType.SENSITIVE.value
+    US_SSN = TagType.SENSITIVE.value
+
+    # UK
+    UK_NHS = TagType.SENSITIVE.value
+
+    # Spain
+    NIF = TagType.SENSITIVE.value
+
+    # Italy
+    IT_FISCAL_CODE = TagType.SENSITIVE.value
+    IT_DRIVER_LICENSE = TagType.SENSITIVE.value
+    IT_VAT_CODE = TagType.SENSITIVE.value
+    IT_PASSPORT = TagType.SENSITIVE.value
+    IT_IDENTITY_CARD = TagType.SENSITIVE.value
+
+    # Australia
+    AU_ABN = TagType.SENSITIVE.value
+    AU_ACN = TagType.SENSITIVE.value
+    AU_TFN = TagType.SENSITIVE.value
+    AU_MEDICARE = TagType.SENSITIVE.value
+
+
+class StringAnalysis(BaseModel):
+    """
+    Used to store results from the sample data scans for each NER Entity
+    """
+
+    score: float
+    appearances: int
 
 
 # pylint: disable=import-outside-toplevel
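Aside: most NEREntity members share the same TagType value, and Python collapses duplicate enum values into aliases. A small sketch of why the tag lookup in scan (next hunk) goes through NEREntity.__members__, which still exposes every name, rather than iterating the enum; the "Sensitive" / "NonSensitive" strings are assumed stand-ins for the real values in metadata.pii.models:

from enum import Enum


class TagType(Enum):
    SENSITIVE = "Sensitive"  # assumed value
    NONSENSITIVE = "NonSensitive"  # assumed value


class NEREntity(Enum):
    CREDIT_CARD = TagType.SENSITIVE.value
    CRYPTO = TagType.SENSITIVE.value  # same value -> alias of CREDIT_CARD
    DATE_TIME = TagType.NONSENSITIVE.value


# Iteration skips aliases...
print([member.name for member in NEREntity])  # ['CREDIT_CARD', 'DATE_TIME']
# ...but __members__ keeps them, so every Presidio entity name resolves.
print(NEREntity.__members__["CRYPTO"].value)  # Sensitive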
@@ -68,46 +108,61 @@ class NERScanner:
 
     @staticmethod
     def get_highest_score_label(
-        labels_score, str_sample_data_rows: List[str]
-    ) -> Tuple[Optional[str], Optional[float]]:
-        most_used_label_occurrence = 0
-        label_score = None
-        for label, score in labels_score.items():
-            if score[0] == 1.0 and score[1] > len(str_sample_data_rows) * 0.8:
-                return (label, score[0])
-            if score[1] > most_used_label_occurrence:
-                label_score = (label, score[0])
-                most_used_label_occurrence = score[1]
-        return label_score or (None, None)
+        entities_score: Dict[str, StringAnalysis]
+    ) -> Tuple[str, float]:
+        top_entity = max(
+            entities_score,
+            key=lambda type_: entities_score[type_].score
+            * entities_score[type_].appearances
+            * 0.8,
+        )
+        return top_entity, entities_score[top_entity].score
 
     def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
         """
-        Scan the column's sample data rows and look for PII
+        Scan the column's sample data rows and look for PII.
+
+        How this works:
+        1. We create a list of strings [s1, s2, ..., sn] with each sample data row for a column
+        2. Then, for each s_i:
+          a. Run the analyzer, which will return a list of possible recognized Entities and confidence score
+             For example, the result of analyzing `123456789` gives us
+             [
+               type: DATE_TIME, start: 0, end: 9, score: 0.85,
+               type: US_BANK_NUMBER, start: 0, end: 9, score: 0.05,
+               type: US_PASSPORT, start: 0, end: 9, score: 0.05,
+               type: US_DRIVER_LICENSE, start: 0, end: 9, score: 0.01
+             ]
+          b. Each time an `Entity` appears (e.g., DATE_TIME), we store its max score and the number of appearances
+        3. After gathering all the results for each row, get the `Entity` with maximum overall score
+           and number of appearances. This gets computed as "score * appearances * 0.8", which can
+           be thought as the "score" times "weighted down appearances".
+        4. Once we have the "top" `Entity` from that column, we assign the PII label accordingly from `NEREntity`.
         """
         logger.debug("Processing '%s'", sample_data_rows)
-        labels_score = {}
+
+        # Initialize an empty dict for the given row list
+        entities_score: Dict[str, StringAnalysis] = defaultdict(
+            lambda: StringAnalysis(score=0, appearances=0)
+        )
+
         str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
         for row in str_sample_data_rows:
             try:
                 results = self.analyzer.analyze(row, language="en")
                 for result in results:
-                    logger.debug("Found %s", result.entity_type)
-                    tag = result.entity_type
-                    if tag in labels_score:
-                        labels_score[tag] = (
-                            result.score
-                            if result.score > labels_score[tag][0]
-                            else labels_score[tag][0],
-                            labels_score[tag][1] + 1,
-                        )
-                    else:
-                        labels_score[tag] = (result.score, 1)
+                    entities_score[result.entity_type] = StringAnalysis(
+                        score=result.score
+                        if result.score > entities_score[result.entity_type].score
+                        else entities_score[result.entity_type].score,
+                        appearances=entities_score[result.entity_type].appearances + 1,
+                    )
             except Exception as exc:
                 logger.warning(f"Unknown error while processing {row} - {exc}")
                 logger.debug(traceback.format_exc())
 
-        label, score = self.get_highest_score_label(labels_score, str_sample_data_rows)
-        if label and score:
+        if entities_score:
+            label, score = self.get_highest_score_label(entities_score)
             tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
             return TagAndConfidence(tag=tag_type, confidence=score)
 
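To make step 3 of the docstring concrete, here is a worked run of the same ranking that get_highest_score_label applies, using a hypothetical aggregate built from the docstring's `123456789` example seen across three rows. Note that multiplying every candidate by the same constant 0.8 does not change which entity wins the max; the deciding factor is score times appearances:

from pydantic import BaseModel


class StringAnalysis(BaseModel):  # mirrors the model added above
    score: float
    appearances: int


# Hypothetical aggregate after analyzing three rows like "123456789"
entities_score = {
    "DATE_TIME": StringAnalysis(score=0.85, appearances=3),
    "US_BANK_NUMBER": StringAnalysis(score=0.05, appearances=3),
    "US_PASSPORT": StringAnalysis(score=0.05, appearances=3),
}

# Same key as get_highest_score_label: score * appearances * 0.8
top_entity = max(
    entities_score,
    key=lambda type_: entities_score[type_].score
    * entities_score[type_].appearances
    * 0.8,
)
print(top_entity, entities_score[top_entity].score)  # DATE_TIME 0.85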
@@ -69,31 +69,34 @@ class PIIProcessor:
         """
         for idx, column in enumerate(table_entity.columns):
 
-            # First, check if the column we are about to process
-            # already has PII tags or not
-            column_has_pii_tag = any(
-                (PII in tag.tagFQN.__root__ for tag in column.tags or [])
-            )
-
-            # If it has PII tags, we skip the processing
-            # for the column
-            if column_has_pii_tag is True:
-                continue
-
-            # Scan by column name. If no results there, check the sample data, if any
-            tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
-                self.ner_scanner.scan([row[idx] for row in table_data.rows])
-                if table_data
-                else None
-            )
-
-            if (
-                tag_and_confidence
-                and tag_and_confidence.tag
-                and tag_and_confidence.confidence >= confidence_threshold / 100
-            ):
-                self.patch_column_tag(
-                    tag_type=tag_and_confidence.tag.value,
-                    table_entity=table_entity,
-                    column_name=table_entity.columns[idx].name.__root__,
-                )
+            try:
+                # First, check if the column we are about to process
+                # already has PII tags or not
+                column_has_pii_tag = any(
+                    (PII in tag.tagFQN.__root__ for tag in column.tags or [])
+                )
+
+                # If it has PII tags, we skip the processing
+                # for the column
+                if column_has_pii_tag is True:
+                    continue
+
+                # Scan by column name. If no results there, check the sample data, if any
+                tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
+                    self.ner_scanner.scan([row[idx] for row in table_data.rows])
+                    if table_data
+                    else None
+                )
+
+                if (
+                    tag_and_confidence
+                    and tag_and_confidence.tag
+                    and tag_and_confidence.confidence >= confidence_threshold / 100
+                ):
+                    self.patch_column_tag(
+                        tag_type=tag_and_confidence.tag.value,
+                        table_entity=table_entity,
+                        column_name=table_entity.columns[idx].name.__root__,
+                    )
+            except Exception as err:
+                logger.warning(f"Error computing PII tags for [{column}] - [{err}]")
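The comment inside the hunk captures the two-stage flow: the cheap column-name scan short-circuits the `or`, so the NER pass over sample data only runs when the name scan comes back empty. A toy illustration with hypothetical stand-in functions (not the real ColumnNameScanner / NERScanner signatures):

from typing import Any, List, Optional


def column_name_scan(name: str) -> Optional[str]:
    # Stand-in for ColumnNameScanner.scan: match on the name alone.
    return "PII.Sensitive" if "password" in name.lower() else None


def ner_scan(rows: List[Any]) -> Optional[str]:
    # Stand-in for NERScanner.scan: inspect the sample data.
    return "PII.Sensitive" if any("@" in str(row) for row in rows) else None


sample_rows = ["alice@example.com", "bob@example.com"]
for column in ("password_hash", "contact"):
    result = column_name_scan(column) or ner_scan(sample_rows)
    print(column, "->", result)
# password_hash -> PII.Sensitive  (name hit; NER never runs)
# contact -> PII.Sensitive        (name miss; NER fallback)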
@@ -51,3 +51,15 @@ class NERScannerTest(TestCase):
             ).tag,
             TagType.SENSITIVE,
         )
+
+    def test_scanner_nonsensitive(self):
+        self.assertEqual(
+            self.ner_scanner.scan(
+                [
+                    "Washington",
+                    "Alaska",
+                    "Netherfield Lea Street",
+                ]
+            ).tag,
+            TagType.NONSENSITIVE,
+        )