Pere Menal-Ferrer a7e2f33adc
feature/pii-column-classifier (#21200)
* Add PII Tag and Sensitivity Level enums.

* Add feature-extraction for PII classification tasks

* Add faker as test dependency

* Add unit tests for presidio tag extractor

* Add PIISensitivityTags enum and update sensitivity mapping logic

* Add Presidio utility functions for PII analysis

* Extend column name regexes for PII

* Add column name split

* Move pii algorithms to dedicated package

* Add tests for PAN, NIF, SSN entities

* Fix linting

* Add comment on why we need to set a specific language for Presidio recognizers, as per PR suggestion.

* Fix version of faker to prevent flaky tests. Fix failing tests.

* Fix wrong import

---------

Co-authored-by: Pere Menal <pere.menal@getcollate.io>
2025-05-16 14:03:49 +02:00

181 lines
5.1 KiB
Python

# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Simple data for testing.
In the future, we might want to use larger datasets to prevent regressions
of the classifiers. These datasets should then be stored in separate files in a format
like CSV, JSON or Parquet.
"""
from typing import List, Optional, TypedDict
from metadata.generated.schema.entity.data.table import DataType
from metadata.pii.algorithms.tags import PIITag
class LabeledData(TypedDict):
    """Labeled sample column used as test data for the PII classifiers.

    Each instance describes one column (name, data type and sample values)
    together with the PII labels attached to it.
    """

    # Name of the column; ``None`` when no column name is provided.
    column_name: Optional[str]
    # Data type of the column.
    column_data_type: DataType
    # Sample values of the column, as strings.
    sample_data: List[str]
    # PII tags labeled for this column (empty when there are none).
    pii_tags: List[PIITag]
    # Sensitivity label of the column (presumably True means sensitive).
    pii_sensitivity: bool
# Email column: three well-formed addresses plus one free-text value.
email_data: LabeledData = LabeledData(
    column_name="user_email",
    column_data_type=DataType.STRING,
    sample_data=[
        "geraldc@gmail.com",
        "saratimithi@godesign.com",
        "heroldsean@google.com",
        "wrong input from user",
    ],
    pii_tags=[PIITag.EMAIL_ADDRESS],
    pii_sensitivity=True,
)
# URL column: http/https URLs, with and without a path and a query string.
# Labeled with the URL tag but not as sensitive.
url_data: LabeledData = LabeledData(
    column_name="user_url",
    column_data_type=DataType.STRING,
    sample_data=[
        "https://www.example.com",
        "http://example.com",
        "https://example.com/path/to/resource",
        "https://example.com/path/to/resource?query=param",
    ],
    pii_tags=[PIITag.URL],
    pii_sensitivity=False,
)
# Phone column: numbers written with a "+1" prefix and dash separators.
phone_data: LabeledData = LabeledData(
    column_name="user_phone",
    column_data_type=DataType.STRING,
    sample_data=[
        "+1-202-555-0173",
        "+1-202-555-0174",
        "+1-202-555-0175",
        "+1-202-555-0176",
    ],
    pii_tags=[PIITag.PHONE_NUMBER],
    pii_sensitivity=True,
)
# Free-text column labeled with no PII tags and as not sensitive.
non_pii_text_data: LabeledData = LabeledData(
    column_name="random_text",
    column_data_type=DataType.STRING,
    sample_data=[
        "This is a random text without any PII.",
        "Another random text.",
        "Just some random words.",
    ],
    pii_tags=[],
    pii_sensitivity=False,
)
# Location column: two place names and one street name.
location_data: LabeledData = LabeledData(
    column_name="user_location",
    column_data_type=DataType.STRING,
    sample_data=[
        "Washington",
        "Alaska",
        "Netherfield Lea Street",
    ],
    pii_tags=[PIITag.LOCATION],
    pii_sensitivity=True,
)
# JSON-encoded column: the first document embeds an email address and a
# street address; the second one contains no such values.
# Annotated as LabeledData like the other fixtures in this module so the
# literal is checked against the same schema.
json_data: LabeledData = {
    "column_name": "user_info",
    "column_data_type": DataType.STRING,
    "sample_data": [
        '{"email": "johndoe@example.com", "address": {"street": "123 Main Street, Anytown, USA"}}',
        '{"email": "potato", "age": 30, "preferences": {"newsletter": true, "notifications": "email"}}',
    ],
    "pii_tags": [PIITag.EMAIL_ADDRESS, PIITag.LOCATION],
    "pii_sensitivity": True,
}
# JSON-encoded column whose keys look PII-related ("email", "address") but
# whose values are placeholders; labeled with no PII tags.
# Annotated as LabeledData like the other fixtures in this module so the
# literal is checked against the same schema.
json_no_pii_data: LabeledData = {
    "column_name": "user_info",
    "column_data_type": DataType.STRING,
    "sample_data": [
        '{"email": "foo", "address": {"street": "bar"}}',
        '{"email": "potato", "age": 30, "preferences": {"newsletter": true, "notifications": "email"}}',
    ],
    "pii_tags": [],
    "pii_sensitivity": False,
}
# Aadhaar numbers (intended to be valid ones), written as plain 12-digit
# strings and as dash- or space-separated groups of four digits.
# No column name is provided for this fixture.
indian_aadhaar_data: LabeledData = LabeledData(
    column_name=None,
    column_data_type=DataType.STRING,
    sample_data=[
        "466299546357",
        "967638147560",
        "988307845186",
        "6622-2350-9284",
        "2161 6729 3627",
        "8384-2795-9970",
        "6213-3631-4249",
        "1667-9750-5883",
        "0249-3285-1294",
    ],
    pii_tags=[PIITag.IN_AADHAAR],
    pii_sensitivity=True,
)
# Indian PAN samples: five letters, four digits and a trailing letter.
# No column name is provided for this fixture.
indian_pan_data: LabeledData = LabeledData(
    column_name=None,
    column_data_type=DataType.STRING,
    sample_data=[
        "AFZPK7190K",
        "BLQSM2938L",
        "CWRTJ5821M",
        "DZXNV9045A",
        "EHYKG6752P",
    ],
    pii_tags=[PIITag.IN_PAN],
    pii_sensitivity=True,
)
# US SSN samples in dash-separated 3-2-4 digit groups.
# No column name is provided for this fixture.
us_ssn_data: LabeledData = LabeledData(
    column_name=None,
    column_data_type=DataType.STRING,
    sample_data=[
        "211-61-2524",
        "123-45-6789",
        "987-65-4321",
        "543-21-0987",
        "678-90-1234",
        "876-54-3210",
    ],
    pii_tags=[PIITag.US_SSN],
    pii_sensitivity=True,
)
# Spanish NIF samples (eight digits followed by a letter).
# ES_NIF is tagged correctly with a score of 1; DATE_TIME and
# US_DRIVER_LICENSE are tagged as well, with scores below 0.5.
# TODO: Add a new field to the LabeledData to specify the winner tag
es_nif_data: LabeledData = LabeledData(
    # Left unset: otherwise it will be confused with a phone number.
    column_name=None,
    column_data_type=DataType.STRING,
    sample_data=[
        "48347544A",
        "08163649Y",
        "85738706L",
        "01922869T",
        "44729355J",
    ],
    pii_tags=[
        PIITag.ES_NIF,
        PIITag.DATE_TIME,
        PIITag.US_DRIVER_LICENSE,  # low score
    ],
    pii_sensitivity=True,
)