mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2026-01-06 04:26:57 +00:00
fix/indian-passport-detection (#21311)
* Remove 'ORGANIZATION' PII Tag as it is no longer supported by our PII detectors. * Update presidio version to fix wrong regex for Indian passport * Increase sample size of Indian passport numbers --------- Co-authored-by: Pere Menal <pere.menal@getcollate.io> (cherry picked from commit 3c6c762d9c0d7036124aae3a4dc90f51d6a674c0)
This commit is contained in:
parent
363c22d50a
commit
6c5c9088ea
@ -66,7 +66,7 @@ VERSIONS = {
|
||||
"google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
|
||||
"pyathena": "pyathena~=3.0",
|
||||
"sqlalchemy-bigquery": "sqlalchemy-bigquery>=1.2.2",
|
||||
"presidio-analyzer": "presidio-analyzer==2.2.355",
|
||||
"presidio-analyzer": "presidio-analyzer==2.2.358",
|
||||
}
|
||||
|
||||
COMMONS = {
|
||||
|
||||
@ -40,7 +40,6 @@ class PIITag(enum.Enum):
|
||||
PHONE_NUMBER = "PHONE_NUMBER"
|
||||
MEDICAL_LICENSE = "MEDICAL_LICENSE"
|
||||
URL = "URL"
|
||||
ORGANIZATION = "ORGANIZATION" # Organization Name, not listed in Presidio website but used in the code
|
||||
|
||||
# USA
|
||||
US_BANK_NUMBER = "US_BANK_NUMBER"
|
||||
|
||||
@ -203,7 +203,7 @@ def test_us_ssn_extraction(fake_en_us, analyzer):
|
||||
def test_aadhaar_extraction(analyzer):
|
||||
# fake = local_fake_factory("en_IN") # Use Indian locale
|
||||
# samples = [fake.aadhaar_id() for _ in range(100)]
|
||||
# Unfortunately, the generated aadhaar_id by Faker are not valid
|
||||
# Unfortunately, the generated aadhaar_ids by Faker are not always valid
|
||||
samples = [
|
||||
"466299546357",
|
||||
"967638147560",
|
||||
@ -224,7 +224,28 @@ def test_aadhaar_extraction(analyzer):
|
||||
)
|
||||
|
||||
|
||||
# TODO: Add more tests for local entities
|
||||
def test_indian_passport_extraction(analyzer):
    """Verify the analyzer tags Indian passport numbers as IN_PASSPORT.

    Uses a fixed set of randomly generated but structurally valid Indian
    passport numbers (one uppercase letter followed by seven digits) and
    provides hint words so the context-aware recognizer can fire.
    """
    # Randomly generated valid Indian passport numbers.
    passport_numbers = [
        "A1234567",
        "B7654321",
        "C2345678",
        "D3456789",
        "E4567890",
        "F5678901",
        "G6789012",
        "H7890123",
        "J8901234",
        "K9012345",
    ]

    # Context hints that bias Presidio toward the passport recognizer.
    hint_words = ["passport", "document"]
    detected = extract_pii_tags(analyzer, passport_numbers, context=hint_words)

    # On failure, surface the expected tag, the inputs, and what was detected.
    failure_details = (PIITag.IN_PASSPORT, passport_numbers, detected)
    assert get_top_pii_tag(detected) == PIITag.IN_PASSPORT, failure_details
|
||||
|
||||
|
||||
def test_extract_pii_from_column_names():
|
||||
|
||||
@ -25,4 +25,7 @@ def test_analyzer_supports_all_expected_pii_entities():
|
||||
|
||||
entities = set(PIITag.values())
|
||||
supported_entities = set(analyzer.get_supported_entities(SUPPORTED_LANG))
|
||||
assert entities <= supported_entities
|
||||
assert entities <= supported_entities, (
|
||||
f"Analyzer does not support all expected PII entities. "
|
||||
f"{entities - supported_entities}"
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user