mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-12-06 12:34:16 +00:00
(cherry picked from commit 2e7c9a087581297ba160260f4d3dc5d37e8e7dca)
This commit is contained in:
parent
0e2f1bd916
commit
f6658345a5
@ -129,12 +129,13 @@ class PIIProcessor(Processor):
|
|||||||
if column_has_pii_tag is True:
|
if column_has_pii_tag is True:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Scan by column name. If no results there, check the sample data, if any
|
# We'll scan first by sample data to prioritize the NER scanner
|
||||||
tag_and_confidence = self.name_scanner.scan(column.name.root) or (
|
# If we find nothing, we'll check the column name
|
||||||
|
tag_and_confidence = (
|
||||||
self.ner_scanner.scan([row[idx] for row in table_data.rows])
|
self.ner_scanner.scan([row[idx] for row in table_data.rows])
|
||||||
if table_data
|
if table_data
|
||||||
else None
|
else None
|
||||||
)
|
) or self.name_scanner.scan(column.name.root)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
tag_and_confidence
|
tag_and_confidence
|
||||||
|
|||||||
@ -22,16 +22,14 @@ from metadata.utils import fqn
|
|||||||
|
|
||||||
|
|
||||||
class ColumnNameScanner(BaseScanner):
|
class ColumnNameScanner(BaseScanner):
|
||||||
"""
|
"""Column Name Scanner to scan column name"""
|
||||||
Column Name Scanner to scan column name
|
|
||||||
"""
|
|
||||||
|
|
||||||
sensitive_regex = {
|
sensitive_regex = {
|
||||||
"PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
|
"PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
|
||||||
"US_SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
|
"US_SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
|
||||||
"CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
|
"CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
|
||||||
"BANK_ACCOUNT": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
|
"BANK_ACCOUNT": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
|
||||||
"EMAIL_ADDRESS": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
|
"EMAIL_ADDRESS": re.compile("^(email|e-mail|mail)(.*address)?$", re.IGNORECASE),
|
||||||
"USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
|
"USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
|
||||||
"PERSON": re.compile(
|
"PERSON": re.compile(
|
||||||
"^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
|
"^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
|
||||||
|
|||||||
@ -40,6 +40,9 @@ def test_column_names_none(scanner):
|
|||||||
assert scanner.scan("id") is None
|
assert scanner.scan("id") is None
|
||||||
assert scanner.scan("user_id") is None
|
assert scanner.scan("user_id") is None
|
||||||
|
|
||||||
|
# Mails
|
||||||
|
assert scanner.scan("email_verified") is None
|
||||||
|
|
||||||
|
|
||||||
def test_column_names_sensitive(scanner):
|
def test_column_names_sensitive(scanner):
|
||||||
# Bank
|
# Bank
|
||||||
@ -59,4 +62,5 @@ def test_column_names_sensitive(scanner):
|
|||||||
assert scanner.scan("client_last_name") == EXPECTED_SENSITIVE
|
assert scanner.scan("client_last_name") == EXPECTED_SENSITIVE
|
||||||
|
|
||||||
assert scanner.scan("email") == EXPECTED_SENSITIVE
|
assert scanner.scan("email") == EXPECTED_SENSITIVE
|
||||||
|
assert scanner.scan("email_address") == EXPECTED_SENSITIVE
|
||||||
assert scanner.scan("ssn") == EXPECTED_SENSITIVE
|
assert scanner.scan("ssn") == EXPECTED_SENSITIVE
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user