diff --git a/ingestion/setup.py b/ingestion/setup.py
index 39deb91a07f..dda9edc8172 100644
--- a/ingestion/setup.py
+++ b/ingestion/setup.py
@@ -66,7 +66,7 @@ VERSIONS = {
     "google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
     "pyathena": "pyathena~=3.0",
     "sqlalchemy-bigquery": "sqlalchemy-bigquery>=1.2.2",
-    "presidio-analyzer": "presidio-analyzer==2.2.355",
+    "presidio-analyzer": "presidio-analyzer==2.2.358",
 }
 
 COMMONS = {
diff --git a/ingestion/src/metadata/pii/algorithms/tags.py b/ingestion/src/metadata/pii/algorithms/tags.py
index cf28af49ddc..5281bfd31d5 100644
--- a/ingestion/src/metadata/pii/algorithms/tags.py
+++ b/ingestion/src/metadata/pii/algorithms/tags.py
@@ -40,7 +40,6 @@ class PIITag(enum.Enum):
     PHONE_NUMBER = "PHONE_NUMBER"
     MEDICAL_LICENSE = "MEDICAL_LICENSE"
     URL = "URL"
-    ORGANIZATION = "ORGANIZATION"  # Organization Name, not listed in Presidio website but used in the code
 
     # USA
     US_BANK_NUMBER = "US_BANK_NUMBER"
diff --git a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
index a3024259d77..db490919162 100644
--- a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
+++ b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
@@ -203,7 +203,7 @@ def test_us_ssn_extraction(fake_en_us, analyzer):
 def test_aadhaar_extraction(analyzer):
     # fake = local_fake_factory("en_IN")  # Use Indian locale
     # samples = [fake.aadhaar_id() for _ in range(100)]
-    # Unfortunately, the generated aadhaar_id by Faker are not valid
+    # Unfortunately, the generated aadhaar_ids by Faker are not always valid
     samples = [
         "466299546357",
         "967638147560",
@@ -224,7 +224,28 @@
     )
 
 
-# TODO: Add more test for local entities
+def test_indian_passport_extraction(analyzer):
+    # Randomly generated valid Indian passport numbers
+    samples = [
+        "A1234567",
+        "B7654321",
+        "C2345678",
+        "D3456789",
+        "E4567890",
+        "F5678901",
+        "G6789012",
+        "H7890123",
+        "J8901234",
+        "K9012345",
+    ]
+
+    context = ["passport", "document"]
+    extracted = extract_pii_tags(analyzer, samples, context=context)
+    assert get_top_pii_tag(extracted) == PIITag.IN_PASSPORT, (
+        PIITag.IN_PASSPORT,
+        samples,
+        extracted,
+    )
 
 
 def test_extract_pii_from_column_names():
diff --git a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
index 2660ec50e06..51b66131f66 100644
--- a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
+++ b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
@@ -25,4 +25,7 @@ def test_analyzer_supports_all_expected_pii_entities():
     entities = set(PIITag.values())
     supported_entities = set(analyzer.get_supported_entities(SUPPORTED_LANG))
 
-    assert entities <= supported_entities
+    assert entities <= supported_entities, (
+        f"Analyzer does not support all expected PII entities. "
+        f"{entities - supported_entities}"
+    )