From 6683c632f42b55a16db9df8acf5fd96b586ce301 Mon Sep 17 00:00:00 2001
From: Pere Menal-Ferrer
Date: Fri, 30 May 2025 08:18:50 +0200
Subject: [PATCH] FIX #21464 (#21463)
* Reproduce failing behaviour with non-date-time data
* Add a presidio patch for DateTimes
* Fix type-check error
---------
Co-authored-by: Pere Menal
---
.../metadata/pii/algorithms/classifiers.py | 8 +++-
.../pii/algorithms/presidio_patches.py | 38 +++++++++++++++++++
.../unit/pii/algorithms/data/pii_samples.py | 33 +++++++++++++++-
.../pii/algorithms/test_feature_extraction.py | 26 ++++++++++++-
4 files changed, 101 insertions(+), 4 deletions(-)
diff --git a/ingestion/src/metadata/pii/algorithms/classifiers.py b/ingestion/src/metadata/pii/algorithms/classifiers.py
index 35de6fee3ae..bfa6c621bfc 100644
--- a/ingestion/src/metadata/pii/algorithms/classifiers.py
+++ b/ingestion/src/metadata/pii/algorithms/classifiers.py
@@ -38,7 +38,11 @@ from metadata.pii.algorithms.feature_extraction import (
split_column_name,
)
from metadata.pii.algorithms.preprocessing import preprocess_values
-from metadata.pii.algorithms.presidio_patches import url_patcher
+from metadata.pii.algorithms.presidio_patches import (
+ combine_patchers,
+ date_time_patcher,
+ url_patcher,
+)
from metadata.pii.algorithms.presidio_utils import (
build_analyzer_engine,
set_presidio_logger_level,
@@ -119,7 +123,7 @@ class HeuristicPIIClassifier(ColumnClassifier[PIITag]):
self._presidio_analyzer,
str_values,
context=context,
- recognizer_result_patcher=url_patcher,
+ recognizer_result_patcher=combine_patchers(date_time_patcher, url_patcher),
)
column_name_matches: Set[PIITag] = set()
diff --git a/ingestion/src/metadata/pii/algorithms/presidio_patches.py b/ingestion/src/metadata/pii/algorithms/presidio_patches.py
index 71b8f0bc31b..cbfb5432ef8 100644
--- a/ingestion/src/metadata/pii/algorithms/presidio_patches.py
+++ b/ingestion/src/metadata/pii/algorithms/presidio_patches.py
@@ -13,6 +13,7 @@ Patch the Presidio recognizer results to make adapt them to specific use cases.
"""
from typing import List, Protocol, Sequence
+from dateutil.parser import parse
from presidio_analyzer import RecognizerResult
@@ -29,6 +30,24 @@ class PresidioRecognizerResultPatcher(Protocol):
...
+def combine_patchers(
+ *patchers: PresidioRecognizerResultPatcher,
+) -> PresidioRecognizerResultPatcher:
+ """
+ Combine multiple patchers into one.
+ This allows us to apply multiple patches in sequence.
+ """
+
+ def combined_patcher(
+ recognizer_results: Sequence[RecognizerResult], text: str
+ ) -> Sequence[RecognizerResult]:
+ for patcher in patchers:
+ recognizer_results = patcher(recognizer_results, text)
+ return recognizer_results
+
+ return combined_patcher
+
+
def url_patcher(
recognizer_results: Sequence[RecognizerResult], text: str
) -> Sequence[RecognizerResult]:
@@ -43,3 +62,22 @@ def url_patcher(
continue
patched_result.append(result)
return patched_result
+
+
+def date_time_patcher(
+ recognizer_results: Sequence[RecognizerResult], text: str
+) -> Sequence[RecognizerResult]:
+ """
+ Patch the recognizer result to remove date time false positive with date.
+ """
+ patched_result: List[RecognizerResult] = []
+ for result in recognizer_results:
+ if result.entity_type == "DATE_TIME":
+ # try to parse using dateutils, if it fails, skip the result
+ try:
+ _ = parse(text[result.start : result.end])
+ except ValueError:
+ # if parsing fails, skip the result
+ continue
+ patched_result.append(result)
+ return patched_result
diff --git a/ingestion/tests/unit/pii/algorithms/data/pii_samples.py b/ingestion/tests/unit/pii/algorithms/data/pii_samples.py
index b029db804e1..9dc2869e0ab 100644
--- a/ingestion/tests/unit/pii/algorithms/data/pii_samples.py
+++ b/ingestion/tests/unit/pii/algorithms/data/pii_samples.py
@@ -70,6 +70,19 @@ phone_data: LabeledData = {
"pii_sensitivity": True,
}
+data_time_data: LabeledData = {
+ "column_name": "event_time",
+ "column_data_type": DataType.STRING,
+ "sample_data": [
+ "2023-10-01 12:00:00Z",
+ "2023-10-02 15:30:00Z",
+ "2023-10-03 18:45:00Z",
+ "2023-10-04 21:15:00Z",
+ ],
+ "pii_tags": [PIITag.DATE_TIME],
+ "pii_sensitivity": False,
+}
+
non_pii_text_data: LabeledData = {
"column_name": "random_text",
"column_data_type": DataType.STRING,
@@ -173,8 +186,26 @@ es_nif_data: LabeledData = {
"sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"],
"pii_tags": [
PIITag.ES_NIF,
- PIITag.DATE_TIME,
PIITag.US_DRIVER_LICENSE, # low score
],
"pii_sensitivity": True,
}
+
+# Sample data for regression tests
+
+# Previously, this data was incorrectly tagged as PII.DATE_TIME
+false_positive_datetime_data: LabeledData = {
+ "column_name": None,
+ "column_data_type": DataType.STRING,
+ "sample_data": [
+ "60001",
+ "60002",
+ "60003",
+ "60004",
+ "60005",
+ "60006",
+ "60007",
+ ],
+ "pii_tags": [],
+ "pii_sensitivity": False,
+}
diff --git a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
index db490919162..4ce0fcf4d90 100644
--- a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
+++ b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py
@@ -16,7 +16,7 @@ from metadata.pii.algorithms.feature_extraction import (
extract_pii_tags,
split_column_name,
)
-from metadata.pii.algorithms.presidio_patches import url_patcher
+from metadata.pii.algorithms.presidio_patches import date_time_patcher, url_patcher
from metadata.pii.algorithms.tags import PIITag
@@ -133,6 +133,30 @@ def test_person_extraction(fake, analyzer):
)
+def test_date_time_extraction_false_positive_regression(fake, analyzer):
+ """
+ Regression test for a false positive where a timestamp was incorrectly
+ marked as a date by the Presidio analyzer.
+ """
+ not_dates = [60001, 60002, 60003, 60004, 60005]
+ not_dates_str = [str(date) for date in not_dates]
+ extracted = extract_pii_tags(
+ analyzer, not_dates_str, recognizer_result_patcher=date_time_patcher
+ )
+ assert PIITag.DATE_TIME not in extracted
+
+
+def test_date_time_extraction_with_patched_results(fake, analyzer):
+ # Generate a list of dates and times
+ samples = [str(fake.date_time_this_century()) for _ in range(100)]
+ # Patch the results to avoid false positives
+ extracted = extract_pii_tags(
+ analyzer, samples, recognizer_result_patcher=date_time_patcher
+ )
+
+ assert PIITag.DATE_TIME in extracted
+
+
# Extraction with patched URL
def test_email_address_extraction_does_not_extract_url(fake, analyzer):
samples = [fake.email() for _ in range(100)]