diff --git a/ingestion/src/metadata/pii/algorithms/classifiers.py b/ingestion/src/metadata/pii/algorithms/classifiers.py index 35de6fee3ae..bfa6c621bfc 100644 --- a/ingestion/src/metadata/pii/algorithms/classifiers.py +++ b/ingestion/src/metadata/pii/algorithms/classifiers.py @@ -38,7 +38,11 @@ from metadata.pii.algorithms.feature_extraction import ( split_column_name, ) from metadata.pii.algorithms.preprocessing import preprocess_values -from metadata.pii.algorithms.presidio_patches import url_patcher +from metadata.pii.algorithms.presidio_patches import ( + combine_patchers, + date_time_patcher, + url_patcher, +) from metadata.pii.algorithms.presidio_utils import ( build_analyzer_engine, set_presidio_logger_level, @@ -119,7 +123,7 @@ class HeuristicPIIClassifier(ColumnClassifier[PIITag]): self._presidio_analyzer, str_values, context=context, - recognizer_result_patcher=url_patcher, + recognizer_result_patcher=combine_patchers(date_time_patcher, url_patcher), ) column_name_matches: Set[PIITag] = set() diff --git a/ingestion/src/metadata/pii/algorithms/presidio_patches.py b/ingestion/src/metadata/pii/algorithms/presidio_patches.py index 71b8f0bc31b..cbfb5432ef8 100644 --- a/ingestion/src/metadata/pii/algorithms/presidio_patches.py +++ b/ingestion/src/metadata/pii/algorithms/presidio_patches.py @@ -13,6 +13,7 @@ Patch the Presidio recognizer results to make adapt them to specific use cases. """ from typing import List, Protocol, Sequence +from dateutil.parser import parse from presidio_analyzer import RecognizerResult @@ -29,6 +30,24 @@ class PresidioRecognizerResultPatcher(Protocol): ... +def combine_patchers( + *patchers: PresidioRecognizerResultPatcher, +) -> PresidioRecognizerResultPatcher: + """ + Combine multiple patchers into one. + This allows us to apply multiple patches in sequence. + """ + + def combined_patcher( + recognizer_results: Sequence[RecognizerResult], text: str + ) -> Sequence[RecognizerResult]: + for patcher in patchers: + recognizer_results = patcher(recognizer_results, text) + return recognizer_results + + return combined_patcher + + def url_patcher( recognizer_results: Sequence[RecognizerResult], text: str ) -> Sequence[RecognizerResult]: @@ -43,3 +62,22 @@ def url_patcher( continue patched_result.append(result) return patched_result + + +def date_time_patcher( + recognizer_results: Sequence[RecognizerResult], text: str +) -> Sequence[RecognizerResult]: + """ + Patch the recognizer result to remove date time false positive with date. + """ + patched_result: List[RecognizerResult] = [] + for result in recognizer_results: + if result.entity_type == "DATE_TIME": + # try to parse using dateutils, if it fails, skip the result + try: + _ = parse(text[result.start : result.end]) + except ValueError: + # if parsing fails, skip the result + continue + patched_result.append(result) + return patched_result diff --git a/ingestion/tests/unit/pii/algorithms/data/pii_samples.py b/ingestion/tests/unit/pii/algorithms/data/pii_samples.py index b029db804e1..9dc2869e0ab 100644 --- a/ingestion/tests/unit/pii/algorithms/data/pii_samples.py +++ b/ingestion/tests/unit/pii/algorithms/data/pii_samples.py @@ -70,6 +70,19 @@ phone_data: LabeledData = { "pii_sensitivity": True, } +data_time_data: LabeledData = { + "column_name": "event_time", + "column_data_type": DataType.STRING, + "sample_data": [ + "2023-10-01 12:00:00Z", + "2023-10-02 15:30:00Z", + "2023-10-03 18:45:00Z", + "2023-10-04 21:15:00Z", + ], + "pii_tags": [PIITag.DATE_TIME], + "pii_sensitivity": False, +} + non_pii_text_data: LabeledData = { "column_name": "random_text", "column_data_type": DataType.STRING, @@ -173,8 +186,26 @@ es_nif_data: LabeledData = { "sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"], "pii_tags": [ PIITag.ES_NIF, - PIITag.DATE_TIME, PIITag.US_DRIVER_LICENSE, # low score ], "pii_sensitivity": True, } + +# Sample data for regression tests + +# Previously, this data was incorrectly tagged as PII.DATE_TIME +false_positive_datetime_data: LabeledData = { + "column_name": None, + "column_data_type": DataType.STRING, + "sample_data": [ + "60001", + "60002", + "60003", + "60004", + "60005", + "60006", + "60007", + ], + "pii_tags": [], + "pii_sensitivity": False, +} diff --git a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py index db490919162..4ce0fcf4d90 100644 --- a/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py +++ b/ingestion/tests/unit/pii/algorithms/test_feature_extraction.py @@ -16,7 +16,7 @@ from metadata.pii.algorithms.feature_extraction import ( extract_pii_tags, split_column_name, ) -from metadata.pii.algorithms.presidio_patches import url_patcher +from metadata.pii.algorithms.presidio_patches import date_time_patcher, url_patcher from metadata.pii.algorithms.tags import PIITag @@ -133,6 +133,30 @@ def test_person_extraction(fake, analyzer): ) +def test_date_time_extraction_false_positive_regression(fake, analyzer): + """ + Regression test for a false positive where a timestamp was incorrectly + marked as a date by the Presidio analyzer. + """ + not_dates = [60001, 60002, 60003, 60004, 60005] + not_dates_str = [str(date) for date in not_dates] + extracted = extract_pii_tags( + analyzer, not_dates_str, recognizer_result_patcher=date_time_patcher + ) + assert PIITag.DATE_TIME not in extracted + + +def test_date_time_extraction_with_patched_results(fake, analyzer): + # Generate a list of dates and times + samples = [str(fake.date_time_this_century()) for _ in range(100)] + # Patch the results to avoid false positives + extracted = extract_pii_tags( + analyzer, samples, recognizer_result_patcher=date_time_patcher + ) + + assert PIITag.DATE_TIME in extracted + + # Extraction with patched URL def test_email_address_extraction_does_not_extract_url(fake, analyzer): samples = [fake.email() for _ in range(100)]