From 6c5c9088ea542685f2b41a71ea1043cdd448ec7d Mon Sep 17 00:00:00 2001
From: Pere Menal-Ferrer <p.menal@stuart.com>
Date: Tue, 20 May 2025 15:32:21 +0200
Subject: [PATCH] fix/indian-passport-detection (#21311)

* Remove 'ORGANIZATION' PII Tag as it is no longer supported by our PII detectors.

* Update presidio version to fix wrong regex for Indian passport

* Increase sample size of Indian passport numbers

---------

Co-authored-by: Pere Menal <pere.menal@getcollate.io>
(cherry picked from commit 3c6c762d9c0d7036124aae3a4dc90f51d6a674c0)
---
 ingestion/setup.py                            |  2 +-
 ingestion/src/metadata/pii/algorithms/tags.py |  1 -
 .../pii/algorithms/test_feature_extraction.py | 25 +++++++++++++++++--
 .../pii/algorithms/test_presidio_utils.py     |  5 +++-
 4 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/ingestion/setup.py b/ingestion/setup.py
index 39deb91a07f..dda9edc8172 100644
--- a/ingestion/setup.py
+++ b/ingestion/setup.py
@@ -66,7 +66,7 @@ VERSIONS = {
     "google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
     "pyathena": "pyathena~=3.0",
     "sqlalchemy-bigquery": "sqlalchemy-bigquery>=1.2.2",
-    "presidio-analyzer": "presidio-analyzer==2.2.355",
+    "presidio-analyzer": "presidio-analyzer==2.2.358",
 }
 
 COMMONS = {
diff --git a/ingestion/src/metadata/pii/algorithms/tags.py b/ingestion/src/metadata/pii/algorithms/tags.py
index cf28af49ddc..5281bfd31d5 100644
--- a/ingestion/src/metadata/pii/algorithms/tags.py
+++ b/ingestion/src/metadata/pii/algorithms/tags.py
@@ -40,7 +40,6 @@ class PIITag(enum.Enum):
     PHONE_NUMBER = "PHONE_NUMBER"
     MEDICAL_LICENSE = "MEDICAL_LICENSE"
     URL = "URL"
-    ORGANIZATION = "ORGANIZATION"  # Organization Name, not listed in Presidio website but used in the code
 
     # USA
     US_BANK_NUMBER = "US_BANK_NUMBER"
diff --git a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
index a3024259d77..db490919162 100644
--- a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
+++ b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
@@ -203,7 +203,7 @@ def test_us_ssn_extraction(fake_en_us, analyzer):
 def test_aadhaar_extraction(analyzer):
     # fake = local_fake_factory("en_IN")  # Use Indian locale
     # samples = [fake.aadhaar_id() for _ in range(100)]
-    # Unfortunately, the generated aadhaar_id by Faker are not valid
+    # Unfortunately, the aadhaar_ids generated by Faker are not always valid
     samples = [
         "466299546357",
         "967638147560",
@@ -224,7 +224,28 @@ def test_aadhaar_extraction(analyzer):
     )
 
 
-# TODO: Add more test for local entities
+def test_indian_passport_extraction(analyzer):
+    # Randomly generated valid Indian passport numbers
+    samples = [
+        "A1234567",
+        "B7654321",
+        "C2345678",
+        "D3456789",
+        "E4567890",
+        "F5678901",
+        "G6789012",
+        "H7890123",
+        "J8901234",
+        "K9012345",
+    ]
+
+    context = ["passport", "document"]
+    extracted = extract_pii_tags(analyzer, samples, context=context)
+    assert get_top_pii_tag(extracted) == PIITag.IN_PASSPORT, (
+        PIITag.IN_PASSPORT,
+        samples,
+        extracted,
+    )
 
 
 def test_extract_pii_from_column_names():
diff --git a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
index 2660ec50e06..51b66131f66 100644
--- a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
+++ b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
@@ -25,4 +25,7 @@ def test_analyzer_supports_all_expected_pii_entities():
 
     entities = set(PIITag.values())
     supported_entities = set(analyzer.get_supported_entities(SUPPORTED_LANG))
-    assert entities <= supported_entities
+    assert entities <= supported_entities, (
+        f"Analyzer does not support all expected PII entities. "
+        f"{entities - supported_entities}"
+    )