* Reproduce failing behaviour with non-date-time data

* Add a presidio patch for DateTimes

* Fix type-check error

---------

Co-authored-by: Pere Menal <pere.menal@getcollate.io>
This commit is contained in:
Pere Menal-Ferrer 2025-05-30 08:18:50 +02:00 committed by GitHub
parent 324aab71a4
commit 6683c632f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 101 additions and 4 deletions

View File

@ -38,7 +38,11 @@ from metadata.pii.algorithms.feature_extraction import (
split_column_name, split_column_name,
) )
from metadata.pii.algorithms.preprocessing import preprocess_values from metadata.pii.algorithms.preprocessing import preprocess_values
from metadata.pii.algorithms.presidio_patches import url_patcher from metadata.pii.algorithms.presidio_patches import (
combine_patchers,
date_time_patcher,
url_patcher,
)
from metadata.pii.algorithms.presidio_utils import ( from metadata.pii.algorithms.presidio_utils import (
build_analyzer_engine, build_analyzer_engine,
set_presidio_logger_level, set_presidio_logger_level,
@ -119,7 +123,7 @@ class HeuristicPIIClassifier(ColumnClassifier[PIITag]):
self._presidio_analyzer, self._presidio_analyzer,
str_values, str_values,
context=context, context=context,
recognizer_result_patcher=url_patcher, recognizer_result_patcher=combine_patchers(date_time_patcher, url_patcher),
) )
column_name_matches: Set[PIITag] = set() column_name_matches: Set[PIITag] = set()

View File

@ -13,6 +13,7 @@ Patch the Presidio recognizer results to adapt them to specific use cases.
""" """
from typing import List, Protocol, Sequence from typing import List, Protocol, Sequence
from dateutil.parser import parse
from presidio_analyzer import RecognizerResult from presidio_analyzer import RecognizerResult
@ -29,6 +30,24 @@ class PresidioRecognizerResultPatcher(Protocol):
... ...
def combine_patchers(
    *patchers: PresidioRecognizerResultPatcher,
) -> PresidioRecognizerResultPatcher:
    """
    Chain several patchers into a single patcher.

    The returned callable feeds the recognizer results through every given
    patcher, in the order they were passed. Each patcher receives the output
    of the previous one together with the same text. When no patcher is
    given, the results are returned untouched.
    """

    def combined_patcher(
        recognizer_results: Sequence[RecognizerResult], text: str
    ) -> Sequence[RecognizerResult]:
        current = recognizer_results
        for apply_patch in patchers:
            # Each step sees what the previous step left behind.
            current = apply_patch(current, text)
        return current

    return combined_patcher
def url_patcher( def url_patcher(
recognizer_results: Sequence[RecognizerResult], text: str recognizer_results: Sequence[RecognizerResult], text: str
) -> Sequence[RecognizerResult]: ) -> Sequence[RecognizerResult]:
@ -43,3 +62,22 @@ def url_patcher(
continue continue
patched_result.append(result) patched_result.append(result)
return patched_result return patched_result
def date_time_patcher(
    recognizer_results: Sequence[RecognizerResult], text: str
) -> Sequence[RecognizerResult]:
    """
    Patch the recognizer results to remove DATE_TIME false positives.

    Every result whose entity type is DATE_TIME is kept only if the span of
    `text` it points to (`text[result.start : result.end]`) can be parsed by
    dateutil. Results of any other entity type are passed through unchanged,
    and the relative order of the kept results is preserved.

    Args:
        recognizer_results: results produced by the analyzer for `text`.
        text: the text the results refer to.

    Returns:
        A new list with the unparseable DATE_TIME results removed.
    """
    patched_result: List[RecognizerResult] = []
    for result in recognizer_results:
        if result.entity_type == "DATE_TIME":
            try:
                parse(text[result.start : result.end])
            except (ValueError, OverflowError):
                # dateutil reports an unparseable string with ValueError
                # (ParserError) and an out-of-range value with OverflowError.
                # Either way the span is not a usable date, so drop it
                # instead of letting the error abort the classification.
                continue
        patched_result.append(result)
    return patched_result

View File

@ -70,6 +70,19 @@ phone_data: LabeledData = {
"pii_sensitivity": True, "pii_sensitivity": True,
} }
# Labeled sample: a string column of timestamps, expected to be tagged as
# DATE_TIME and marked as not sensitive.
data_time_data: LabeledData = {
    "column_name": "event_time",
    "column_data_type": DataType.STRING,
    "sample_data": [
        "2023-10-01 12:00:00Z",
        "2023-10-02 15:30:00Z",
        "2023-10-03 18:45:00Z",
        "2023-10-04 21:15:00Z",
    ],
    "pii_tags": [PIITag.DATE_TIME],
    "pii_sensitivity": False,
}
non_pii_text_data: LabeledData = { non_pii_text_data: LabeledData = {
"column_name": "random_text", "column_name": "random_text",
"column_data_type": DataType.STRING, "column_data_type": DataType.STRING,
@ -173,8 +186,26 @@ es_nif_data: LabeledData = {
"sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"], "sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"],
"pii_tags": [ "pii_tags": [
PIITag.ES_NIF, PIITag.ES_NIF,
PIITag.DATE_TIME,
PIITag.US_DRIVER_LICENSE, # low score PIITag.US_DRIVER_LICENSE, # low score
], ],
"pii_sensitivity": True, "pii_sensitivity": True,
} }
# Sample data for regression tests.
# Previously, these plain numeric strings were incorrectly tagged as
# PIITag.DATE_TIME; the expected outcome is no PII tags at all.
false_positive_datetime_data: LabeledData = {
    "column_name": None,
    "column_data_type": DataType.STRING,
    "sample_data": [
        "60001",
        "60002",
        "60003",
        "60004",
        "60005",
        "60006",
        "60007",
    ],
    "pii_tags": [],
    "pii_sensitivity": False,
}

View File

@ -16,7 +16,7 @@ from metadata.pii.algorithms.feature_extraction import (
extract_pii_tags, extract_pii_tags,
split_column_name, split_column_name,
) )
from metadata.pii.algorithms.presidio_patches import url_patcher from metadata.pii.algorithms.presidio_patches import date_time_patcher, url_patcher
from metadata.pii.algorithms.tags import PIITag from metadata.pii.algorithms.tags import PIITag
@ -133,6 +133,30 @@ def test_person_extraction(fake, analyzer):
) )
def test_date_time_extraction_false_positive_regression(fake, analyzer):
    """
    Regression test for a false positive: plain numeric strings that are not
    dates must not be tagged as DATE_TIME once the date-time patcher is
    applied to the analyzer results.
    """
    not_dates_str = [str(value) for value in range(60001, 60006)]

    tags = extract_pii_tags(
        analyzer, not_dates_str, recognizer_result_patcher=date_time_patcher
    )

    assert PIITag.DATE_TIME not in tags
def test_date_time_extraction_with_patched_results(fake, analyzer):
    """Genuine date-times must still be tagged as DATE_TIME when patched."""
    # Build 100 stringified date-times from the faker fixture.
    date_time_strings = []
    for _ in range(100):
        date_time_strings.append(str(fake.date_time_this_century()))

    # Run the extraction with the patcher that filters false positives.
    tags = extract_pii_tags(
        analyzer, date_time_strings, recognizer_result_patcher=date_time_patcher
    )

    assert PIITag.DATE_TIME in tags
# Extraction with patched URL # Extraction with patched URL
def test_email_address_extraction_does_not_extract_url(fake, analyzer): def test_email_address_extraction_does_not_extract_url(fake, analyzer):
samples = [fake.email() for _ in range(100)] samples = [fake.email() for _ in range(100)]