diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py index 7395371ab2d..d85b1857fb3 100644 --- a/ingestion/src/metadata/pii/processor.py +++ b/ingestion/src/metadata/pii/processor.py @@ -129,12 +129,13 @@ class PIIProcessor(Processor): if column_has_pii_tag is True: return None - # Scan by column name. If no results there, check the sample data, if any - tag_and_confidence = self.name_scanner.scan(column.name.root) or ( + # We'll scan first by sample data to prioritize the NER scanner + # If we find nothing, we'll check the column name + tag_and_confidence = ( self.ner_scanner.scan([row[idx] for row in table_data.rows]) if table_data else None - ) + ) or self.name_scanner.scan(column.name.root) if ( tag_and_confidence diff --git a/ingestion/src/metadata/pii/scanners/column_name_scanner.py b/ingestion/src/metadata/pii/scanners/column_name_scanner.py index fe25af692c5..0d5b5a49abb 100644 --- a/ingestion/src/metadata/pii/scanners/column_name_scanner.py +++ b/ingestion/src/metadata/pii/scanners/column_name_scanner.py @@ -22,16 +22,14 @@ from metadata.utils import fqn class ColumnNameScanner(BaseScanner): - """ - Column Name Scanner to scan column name - """ + """Column Name Scanner to scan column name""" sensitive_regex = { "PASSWORD": re.compile("^.*password.*$", re.IGNORECASE), "US_SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE), "CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE), "BANK_ACCOUNT": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE), - "EMAIL_ADDRESS": re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE), + "EMAIL_ADDRESS": re.compile("^(email|e-mail|mail)(.*address)?$", re.IGNORECASE), "USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE), "PERSON": re.compile( "^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$", diff --git a/ingestion/tests/unit/pii/test_column_name_scanner.py b/ingestion/tests/unit/pii/test_column_name_scanner.py index 4f4307b7a76..7dc467704cd 100644 --- a/ingestion/tests/unit/pii/test_column_name_scanner.py +++ b/ingestion/tests/unit/pii/test_column_name_scanner.py @@ -40,6 +40,9 @@ def test_column_names_none(scanner): assert scanner.scan("id") is None assert scanner.scan("user_id") is None + # Mails + assert scanner.scan("email_verified") is None + def test_column_names_sensitive(scanner): # Bank @@ -59,4 +62,5 @@ def test_column_names_sensitive(scanner): assert scanner.scan("client_last_name") == EXPECTED_SENSITIVE assert scanner.scan("email") == EXPECTED_SENSITIVE + assert scanner.scan("email_address") == EXPECTED_SENSITIVE assert scanner.scan("ssn") == EXPECTED_SENSITIVE