diff --git a/ingestion/setup.py b/ingestion/setup.py index 16d40684be8..aa5bd5aa27c 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -316,7 +316,7 @@ plugins: Dict[str, Set[str]] = { VERSIONS["spacy"], VERSIONS["pandas"], VERSIONS["numpy"], - "presidio-analyzer==2.2.32", + "presidio-analyzer==2.2.355", }, } diff --git a/ingestion/src/metadata/pii/ner.py b/ingestion/src/metadata/pii/ner.py index 6e8cd3ee8a8..c61e99b1e6a 100644 --- a/ingestion/src/metadata/pii/ner.py +++ b/ingestion/src/metadata/pii/ner.py @@ -51,7 +51,8 @@ class NEREntity(Enum): UK_NHS = TagType.SENSITIVE.value # Spain - NIF = TagType.SENSITIVE.value + ES_NIF = TagType.SENSITIVE.value + ES_NIE = TagType.SENSITIVE.value # Italy IT_FISCAL_CODE = TagType.SENSITIVE.value @@ -60,8 +61,25 @@ class NEREntity(Enum): IT_PASSPORT = TagType.SENSITIVE.value IT_IDENTITY_CARD = TagType.SENSITIVE.value + # Poland + PL_PESEL = TagType.SENSITIVE.value + + # Singapore + SG_NRIC_FIN = TagType.SENSITIVE.value + SG_UEN = TagType.SENSITIVE.value + # Australia AU_ABN = TagType.SENSITIVE.value AU_ACN = TagType.SENSITIVE.value AU_TFN = TagType.SENSITIVE.value AU_MEDICARE = TagType.SENSITIVE.value + + # India + IN_PAN = TagType.SENSITIVE.value + IN_AADHAAR = TagType.SENSITIVE.value + IN_VEHICLE_REGISTRATION = TagType.SENSITIVE.value + IN_VOTER = TagType.SENSITIVE.value + IN_PASSPORT = TagType.SENSITIVE.value + + # Finland + FI_PERSONAL_IDENTITY_CODE = TagType.SENSITIVE.value diff --git a/ingestion/tests/unit/pii/test_ner_scanner.py b/ingestion/tests/unit/pii/test_ner_scanner.py index f2c7c7abf37..c786e399ba3 100644 --- a/ingestion/tests/unit/pii/test_ner_scanner.py +++ b/ingestion/tests/unit/pii/test_ner_scanner.py @@ -141,3 +141,25 @@ def test_scanner_with_lists(scanner): ).tag_fqn == "PII.Sensitive" ) + + +def test_scan_entities(scanner): + """ + We can properly validate certain entities. 
+ + > NOTE: These values are randomly generated, not valid IDs for any actual use + """ + pan_numbers = ["AFZPK7190K", "BLQSM2938L", "CWRTJ5821M", "DZXNV9045A", "EHYKG6752P"] + assert scanner.scan(pan_numbers).tag_fqn == "PII.Sensitive" + + ssn_numbers = [ + "123-45-6789", + "987-65-4321", + "543-21-0987", + "678-90-1234", + "876-54-3210", + ] + assert scanner.scan(ssn_numbers).tag_fqn == "PII.Sensitive" + + nif_numbers = ["12345678A", "87654321B", "23456789C", "98765432D", "34567890E"] + assert scanner.scan(nif_numbers).tag_fqn == "PII.Sensitive"