FIX #19765 - Improve Column Name Scanner (#20136)

(cherry picked from commit 2e7c9a087581297ba160260f4d3dc5d37e8e7dca)
This commit is contained in:
Pere Miquel Brull 2025-03-07 14:32:59 +01:00 committed by OpenMetadata Release Bot
parent 0e2f1bd916
commit f6658345a5
3 changed files with 10 additions and 7 deletions

View File

@ -129,12 +129,13 @@ class PIIProcessor(Processor):
if column_has_pii_tag is True:
return None
# Scan by column name. If no results there, check the sample data, if any
tag_and_confidence = self.name_scanner.scan(column.name.root) or (
# Scan the sample data first so that the NER scanner takes priority.
# If it finds nothing, fall back to scanning the column name.
tag_and_confidence = (
self.ner_scanner.scan([row[idx] for row in table_data.rows])
if table_data
else None
)
) or self.name_scanner.scan(column.name.root)
if (
tag_and_confidence

View File

@ -22,16 +22,14 @@ from metadata.utils import fqn
class ColumnNameScanner(BaseScanner):
"""
Column Name Scanner to scan column name
"""
"""Column Name Scanner to scan column name"""
sensitive_regex = {
"PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
"US_SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
"CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
"BANK_ACCOUNT": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
"EMAIL_ADDRESS": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
"EMAIL_ADDRESS": re.compile("^(email|e-mail|mail)(.*address)?$", re.IGNORECASE),
"USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
"PERSON": re.compile(
"^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",

View File

@ -40,6 +40,9 @@ def test_column_names_none(scanner):
assert scanner.scan("id") is None
assert scanner.scan("user_id") is None
# Mails
assert scanner.scan("email_verified") is None
def test_column_names_sensitive(scanner):
# Bank
@ -59,4 +62,5 @@ def test_column_names_sensitive(scanner):
assert scanner.scan("client_last_name") == EXPECTED_SENSITIVE
assert scanner.scan("email") == EXPECTED_SENSITIVE
assert scanner.scan("email_address") == EXPECTED_SENSITIVE
assert scanner.scan("ssn") == EXPECTED_SENSITIVE