From 6c5c9088ea542685f2b41a71ea1043cdd448ec7d Mon Sep 17 00:00:00 2001
From: Pere Menal-Ferrer
Date: Tue, 20 May 2025 15:32:21 +0200
Subject: [PATCH] fix/indian-passport-detection (#21311)
* Remove 'ORGANIZATION' PII Tag as it is no longer supported by our PII detectors.
* Update presidio version to fix wrong regex for Indian passport
* Increase sample size of Indian passport numbers
---------
Co-authored-by: Pere Menal
(cherry picked from commit 3c6c762d9c0d7036124aae3a4dc90f51d6a674c0)
---
ingestion/setup.py | 2 +-
ingestion/src/metadata/pii/algorithms/tags.py | 1 -
.../pii/algorithms/test_feature_extraction.py | 25 +++++++++++++++++--
.../pii/algorithms/test_presidio_utils.py | 5 +++-
4 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/ingestion/setup.py b/ingestion/setup.py
index 39deb91a07f..dda9edc8172 100644
--- a/ingestion/setup.py
+++ b/ingestion/setup.py
@@ -66,7 +66,7 @@ VERSIONS = {
"google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
"pyathena": "pyathena~=3.0",
"sqlalchemy-bigquery": "sqlalchemy-bigquery>=1.2.2",
- "presidio-analyzer": "presidio-analyzer==2.2.355",
+ "presidio-analyzer": "presidio-analyzer==2.2.358",
}
COMMONS = {
diff --git a/ingestion/src/metadata/pii/algorithms/tags.py b/ingestion/src/metadata/pii/algorithms/tags.py
index cf28af49ddc..5281bfd31d5 100644
--- a/ingestion/src/metadata/pii/algorithms/tags.py
+++ b/ingestion/src/metadata/pii/algorithms/tags.py
@@ -40,7 +40,6 @@ class PIITag(enum.Enum):
PHONE_NUMBER = "PHONE_NUMBER"
MEDICAL_LICENSE = "MEDICAL_LICENSE"
URL = "URL"
- ORGANIZATION = "ORGANIZATION" # Organization Name, not listed in Presidio website but used in the code
# USA
US_BANK_NUMBER = "US_BANK_NUMBER"
diff --git a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
index a3024259d77..db490919162 100644
--- a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
+++ b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
@@ -203,7 +203,7 @@ def test_us_ssn_extraction(fake_en_us, analyzer):
def test_aadhaar_extraction(analyzer):
# fake = local_fake_factory("en_IN") # Use Indian locale
# samples = [fake.aadhaar_id() for _ in range(100)]
- # Unfortunately, the generated aadhaar_id by Faker are not valid
+ # Unfortunately, the generated aadhaar_ids by Faker are not always valid
samples = [
"466299546357",
"967638147560",
@@ -224,7 +224,28 @@ def test_aadhaar_extraction(analyzer):
)
-# TODO: Add more test for local entities
+def test_indian_passport_extraction(analyzer):
+ # Randomly generated valid Indian passport numbers
+ samples = [
+ "A1234567",
+ "B7654321",
+ "C2345678",
+ "D3456789",
+ "E4567890",
+ "F5678901",
+ "G6789012",
+ "H7890123",
+ "J8901234",
+ "K9012345",
+ ]
+
+ context = ["passport", "document"]
+ extracted = extract_pii_tags(analyzer, samples, context=context)
+ assert get_top_pii_tag(extracted) == PIITag.IN_PASSPORT, (
+ PIITag.IN_PASSPORT,
+ samples,
+ extracted,
+ )
def test_extract_pii_from_column_names():
diff --git a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
index 2660ec50e06..51b66131f66 100644
--- a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
+++ b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
@@ -25,4 +25,7 @@ def test_analyzer_supports_all_expected_pii_entities():
entities = set(PIITag.values())
supported_entities = set(analyzer.get_supported_entities(SUPPORTED_LANG))
- assert entities <= supported_entities
+ assert entities <= supported_entities, (
+ f"Analyzer does not support all expected PII entities. "
+ f"{entities - supported_entities}"
+ )