mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-12-01 18:15:50 +00:00
(cherry picked from commit 2e7c9a087581297ba160260f4d3dc5d37e8e7dca)
This commit is contained in:
parent
0e2f1bd916
commit
f6658345a5
@ -129,12 +129,13 @@ class PIIProcessor(Processor):
|
||||
if column_has_pii_tag is True:
|
||||
return None
|
||||
|
||||
# Scan by column name. If no results there, check the sample data, if any
|
||||
tag_and_confidence = self.name_scanner.scan(column.name.root) or (
|
||||
# We'll scan first by sample data to prioritize the NER scanner
|
||||
# If we find nothing, we'll check the column name
|
||||
tag_and_confidence = (
|
||||
self.ner_scanner.scan([row[idx] for row in table_data.rows])
|
||||
if table_data
|
||||
else None
|
||||
)
|
||||
) or self.name_scanner.scan(column.name.root)
|
||||
|
||||
if (
|
||||
tag_and_confidence
|
||||
|
||||
@ -22,16 +22,14 @@ from metadata.utils import fqn
|
||||
|
||||
|
||||
class ColumnNameScanner(BaseScanner):
|
||||
"""
|
||||
Column Name Scanner to scan column name
|
||||
"""
|
||||
"""Column Name Scanner to scan column name"""
|
||||
|
||||
sensitive_regex = {
|
||||
"PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
|
||||
"US_SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
|
||||
"CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
|
||||
"BANK_ACCOUNT": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
|
||||
"EMAIL_ADDRESS": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
|
||||
"EMAIL_ADDRESS": re.compile("^(email|e-mail|mail)(.*address)?$", re.IGNORECASE),
|
||||
"USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
|
||||
"PERSON": re.compile(
|
||||
"^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
|
||||
|
||||
@ -40,6 +40,9 @@ def test_column_names_none(scanner):
|
||||
assert scanner.scan("id") is None
|
||||
assert scanner.scan("user_id") is None
|
||||
|
||||
# Mails
|
||||
assert scanner.scan("email_verified") is None
|
||||
|
||||
|
||||
def test_column_names_sensitive(scanner):
|
||||
# Bank
|
||||
@ -59,4 +62,5 @@ def test_column_names_sensitive(scanner):
|
||||
assert scanner.scan("client_last_name") == EXPECTED_SENSITIVE
|
||||
|
||||
assert scanner.scan("email") == EXPECTED_SENSITIVE
|
||||
assert scanner.scan("email_address") == EXPECTED_SENSITIVE
|
||||
assert scanner.scan("ssn") == EXPECTED_SENSITIVE
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user