* Reproduce failing behaviour with non-date-time data

* Add a presidio patch for DateTimes

* Fix type-check error

---------

Co-authored-by: Pere Menal <pere.menal@getcollate.io>
This commit is contained in:
Pere Menal-Ferrer 2025-05-30 08:18:50 +02:00 committed by GitHub
parent 324aab71a4
commit 6683c632f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 101 additions and 4 deletions

View File

@ -38,7 +38,11 @@ from metadata.pii.algorithms.feature_extraction import (
split_column_name,
)
from metadata.pii.algorithms.preprocessing import preprocess_values
from metadata.pii.algorithms.presidio_patches import url_patcher
from metadata.pii.algorithms.presidio_patches import (
combine_patchers,
date_time_patcher,
url_patcher,
)
from metadata.pii.algorithms.presidio_utils import (
build_analyzer_engine,
set_presidio_logger_level,
@ -119,7 +123,7 @@ class HeuristicPIIClassifier(ColumnClassifier[PIITag]):
self._presidio_analyzer,
str_values,
context=context,
recognizer_result_patcher=url_patcher,
recognizer_result_patcher=combine_patchers(date_time_patcher, url_patcher),
)
column_name_matches: Set[PIITag] = set()

View File

@ -13,6 +13,7 @@ Patch the Presidio recognizer results to make adapt them to specific use cases.
"""
from typing import List, Protocol, Sequence
from dateutil.parser import parse
from presidio_analyzer import RecognizerResult
@ -29,6 +30,24 @@ class PresidioRecognizerResultPatcher(Protocol):
...
def combine_patchers(
*patchers: PresidioRecognizerResultPatcher,
) -> PresidioRecognizerResultPatcher:
"""
Combine multiple patchers into one.
This allows us to apply multiple patches in sequence.
"""
def combined_patcher(
recognizer_results: Sequence[RecognizerResult], text: str
) -> Sequence[RecognizerResult]:
for patcher in patchers:
recognizer_results = patcher(recognizer_results, text)
return recognizer_results
return combined_patcher
def url_patcher(
recognizer_results: Sequence[RecognizerResult], text: str
) -> Sequence[RecognizerResult]:
@ -43,3 +62,22 @@ def url_patcher(
continue
patched_result.append(result)
return patched_result
def date_time_patcher(
recognizer_results: Sequence[RecognizerResult], text: str
) -> Sequence[RecognizerResult]:
"""
Patch the recognizer result to remove date time false positive with date.
"""
patched_result: List[RecognizerResult] = []
for result in recognizer_results:
if result.entity_type == "DATE_TIME":
# try to parse using dateutils, if it fails, skip the result
try:
_ = parse(text[result.start : result.end])
except ValueError:
# if parsing fails, skip the result
continue
patched_result.append(result)
return patched_result

View File

@ -70,6 +70,19 @@ phone_data: LabeledData = {
"pii_sensitivity": True,
}
data_time_data: LabeledData = {
"column_name": "event_time",
"column_data_type": DataType.STRING,
"sample_data": [
"2023-10-01 12:00:00Z",
"2023-10-02 15:30:00Z",
"2023-10-03 18:45:00Z",
"2023-10-04 21:15:00Z",
],
"pii_tags": [PIITag.DATE_TIME],
"pii_sensitivity": False,
}
non_pii_text_data: LabeledData = {
"column_name": "random_text",
"column_data_type": DataType.STRING,
@ -173,8 +186,26 @@ es_nif_data: LabeledData = {
"sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"],
"pii_tags": [
PIITag.ES_NIF,
PIITag.DATE_TIME,
PIITag.US_DRIVER_LICENSE, # low score
],
"pii_sensitivity": True,
}
# Sample data for regression tests
# Previously, this data was incorrectly tagged as PII.DATE_TIME
false_positive_datetime_data: LabeledData = {
"column_name": None,
"column_data_type": DataType.STRING,
"sample_data": [
"60001",
"60002",
"60003",
"60004",
"60005",
"60006",
"60007",
],
"pii_tags": [],
"pii_sensitivity": False,
}

View File

@ -16,7 +16,7 @@ from metadata.pii.algorithms.feature_extraction import (
extract_pii_tags,
split_column_name,
)
from metadata.pii.algorithms.presidio_patches import url_patcher
from metadata.pii.algorithms.presidio_patches import date_time_patcher, url_patcher
from metadata.pii.algorithms.tags import PIITag
@ -133,6 +133,30 @@ def test_person_extraction(fake, analyzer):
)
def test_date_time_extraction_false_positive_regression(fake, analyzer):
"""
Regression test for a false positive where a timestamp was incorrectly
marked as a date by the Presidio analyzer.
"""
not_dates = [60001, 60002, 60003, 60004, 60005]
not_dates_str = [str(date) for date in not_dates]
extracted = extract_pii_tags(
analyzer, not_dates_str, recognizer_result_patcher=date_time_patcher
)
assert PIITag.DATE_TIME not in extracted
def test_date_time_extraction_with_patched_results(fake, analyzer):
# Generate a list of dates and times
samples = [str(fake.date_time_this_century()) for _ in range(100)]
# Patch the results to avoid false positives
extracted = extract_pii_tags(
analyzer, samples, recognizer_result_patcher=date_time_patcher
)
assert PIITag.DATE_TIME in extracted
# Extraction with patched URL
def test_email_address_extraction_does_not_extract_url(fake, analyzer):
samples = [fake.email() for _ in range(100)]