fix/indian-passport-detection (#21311)

* Remove 'ORGANIZATION' PII Tag as it is no longer supported by our PII detectors.

* Update presidio version to fix wrong regex for Indian passport

* Increase sample size of Indian passport numbers

---------

Co-authored-by: Pere Menal <pere.menal@getcollate.io>
(cherry picked from commit 3c6c762d9c0d7036124aae3a4dc90f51d6a674c0)
This commit is contained in:
Pere Menal-Ferrer 2025-05-20 15:32:21 +02:00 committed by OpenMetadata Release Bot
parent 363c22d50a
commit 6c5c9088ea
4 changed files with 28 additions and 5 deletions

View File

@ -66,7 +66,7 @@ VERSIONS = {
"google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
"pyathena": "pyathena~=3.0",
"sqlalchemy-bigquery": "sqlalchemy-bigquery>=1.2.2",
"presidio-analyzer": "presidio-analyzer==2.2.355",
"presidio-analyzer": "presidio-analyzer==2.2.358",
}
COMMONS = {

View File

@ -40,7 +40,6 @@ class PIITag(enum.Enum):
PHONE_NUMBER = "PHONE_NUMBER"
MEDICAL_LICENSE = "MEDICAL_LICENSE"
URL = "URL"
ORGANIZATION = "ORGANIZATION" # Organization Name, not listed in Presidio website but used in the code
# USA
US_BANK_NUMBER = "US_BANK_NUMBER"

View File

@ -203,7 +203,7 @@ def test_us_ssn_extraction(fake_en_us, analyzer):
def test_aadhaar_extraction(analyzer):
# fake = local_fake_factory("en_IN") # Use Indian locale
# samples = [fake.aadhaar_id() for _ in range(100)]
# Unfortunately, the generated aadhaar_id by Faker are not valid
# Unfortunately, the generated aadhaar_ids by Faker are not always valid
samples = [
"466299546357",
"967638147560",
@ -224,7 +224,28 @@ def test_aadhaar_extraction(analyzer):
)
# TODO: Add more tests for local entities
def test_indian_passport_extraction(analyzer):
    """Check that Indian passport samples resolve to ``PIITag.IN_PASSPORT``."""
    # Hard-coded samples: each is one uppercase letter followed by seven digits.
    passport_numbers = [
        "A1234567",
        "B7654321",
        "C2345678",
        "D3456789",
        "E4567890",
        "F5678901",
        "G6789012",
        "H7890123",
        "J8901234",
        "K9012345",
    ]

    # Context words are passed alongside the samples to the extraction helper.
    detected = extract_pii_tags(
        analyzer, passport_numbers, context=["passport", "document"]
    )

    top_tag = get_top_pii_tag(detected)
    # On failure, report the expected tag, the inputs and the raw extraction.
    assert top_tag == PIITag.IN_PASSPORT, (
        PIITag.IN_PASSPORT,
        passport_numbers,
        detected,
    )
def test_extract_pii_from_column_names():

View File

@ -25,4 +25,7 @@ def test_analyzer_supports_all_expected_pii_entities():
entities = set(PIITag.values())
supported_entities = set(analyzer.get_supported_entities(SUPPORTED_LANG))
assert entities <= supported_entities
assert entities <= supported_entities, (
f"Analyzer does not support all expected PII entities. "
f"{entities - supported_entities}"
)