mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-10-11 16:58:38 +00:00
* Reproduce failing behaviour with non-date-time data * Add a presidio patch for DateTimes * Fix type-check error --------- Co-authored-by: Pere Menal <pere.menal@getcollate.io>
This commit is contained in:
parent
324aab71a4
commit
6683c632f4
@ -38,7 +38,11 @@ from metadata.pii.algorithms.feature_extraction import (
|
||||
split_column_name,
|
||||
)
|
||||
from metadata.pii.algorithms.preprocessing import preprocess_values
|
||||
from metadata.pii.algorithms.presidio_patches import url_patcher
|
||||
from metadata.pii.algorithms.presidio_patches import (
|
||||
combine_patchers,
|
||||
date_time_patcher,
|
||||
url_patcher,
|
||||
)
|
||||
from metadata.pii.algorithms.presidio_utils import (
|
||||
build_analyzer_engine,
|
||||
set_presidio_logger_level,
|
||||
@ -119,7 +123,7 @@ class HeuristicPIIClassifier(ColumnClassifier[PIITag]):
|
||||
self._presidio_analyzer,
|
||||
str_values,
|
||||
context=context,
|
||||
recognizer_result_patcher=url_patcher,
|
||||
recognizer_result_patcher=combine_patchers(date_time_patcher, url_patcher),
|
||||
)
|
||||
|
||||
column_name_matches: Set[PIITag] = set()
|
||||
|
@ -13,6 +13,7 @@ Patch the Presidio recognizer results to make adapt them to specific use cases.
|
||||
"""
|
||||
from typing import List, Protocol, Sequence
|
||||
|
||||
from dateutil.parser import parse
|
||||
from presidio_analyzer import RecognizerResult
|
||||
|
||||
|
||||
@ -29,6 +30,24 @@ class PresidioRecognizerResultPatcher(Protocol):
|
||||
...
|
||||
|
||||
|
||||
def combine_patchers(
|
||||
*patchers: PresidioRecognizerResultPatcher,
|
||||
) -> PresidioRecognizerResultPatcher:
|
||||
"""
|
||||
Combine multiple patchers into one.
|
||||
This allows us to apply multiple patches in sequence.
|
||||
"""
|
||||
|
||||
def combined_patcher(
|
||||
recognizer_results: Sequence[RecognizerResult], text: str
|
||||
) -> Sequence[RecognizerResult]:
|
||||
for patcher in patchers:
|
||||
recognizer_results = patcher(recognizer_results, text)
|
||||
return recognizer_results
|
||||
|
||||
return combined_patcher
|
||||
|
||||
|
||||
def url_patcher(
|
||||
recognizer_results: Sequence[RecognizerResult], text: str
|
||||
) -> Sequence[RecognizerResult]:
|
||||
@ -43,3 +62,22 @@ def url_patcher(
|
||||
continue
|
||||
patched_result.append(result)
|
||||
return patched_result
|
||||
|
||||
|
||||
def date_time_patcher(
|
||||
recognizer_results: Sequence[RecognizerResult], text: str
|
||||
) -> Sequence[RecognizerResult]:
|
||||
"""
|
||||
Patch the recognizer result to remove date time false positive with date.
|
||||
"""
|
||||
patched_result: List[RecognizerResult] = []
|
||||
for result in recognizer_results:
|
||||
if result.entity_type == "DATE_TIME":
|
||||
# try to parse using dateutils, if it fails, skip the result
|
||||
try:
|
||||
_ = parse(text[result.start : result.end])
|
||||
except ValueError:
|
||||
# if parsing fails, skip the result
|
||||
continue
|
||||
patched_result.append(result)
|
||||
return patched_result
|
||||
|
@ -70,6 +70,19 @@ phone_data: LabeledData = {
|
||||
"pii_sensitivity": True,
|
||||
}
|
||||
|
||||
data_time_data: LabeledData = {
|
||||
"column_name": "event_time",
|
||||
"column_data_type": DataType.STRING,
|
||||
"sample_data": [
|
||||
"2023-10-01 12:00:00Z",
|
||||
"2023-10-02 15:30:00Z",
|
||||
"2023-10-03 18:45:00Z",
|
||||
"2023-10-04 21:15:00Z",
|
||||
],
|
||||
"pii_tags": [PIITag.DATE_TIME],
|
||||
"pii_sensitivity": False,
|
||||
}
|
||||
|
||||
non_pii_text_data: LabeledData = {
|
||||
"column_name": "random_text",
|
||||
"column_data_type": DataType.STRING,
|
||||
@ -173,8 +186,26 @@ es_nif_data: LabeledData = {
|
||||
"sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"],
|
||||
"pii_tags": [
|
||||
PIITag.ES_NIF,
|
||||
PIITag.DATE_TIME,
|
||||
PIITag.US_DRIVER_LICENSE, # low score
|
||||
],
|
||||
"pii_sensitivity": True,
|
||||
}
|
||||
|
||||
# Sample data for regression tests
|
||||
|
||||
# Previously, this data was incorrectly tagged as PII.DATE_TIME
|
||||
false_positive_datetime_data: LabeledData = {
|
||||
"column_name": None,
|
||||
"column_data_type": DataType.STRING,
|
||||
"sample_data": [
|
||||
"60001",
|
||||
"60002",
|
||||
"60003",
|
||||
"60004",
|
||||
"60005",
|
||||
"60006",
|
||||
"60007",
|
||||
],
|
||||
"pii_tags": [],
|
||||
"pii_sensitivity": False,
|
||||
}
|
||||
|
@ -16,7 +16,7 @@ from metadata.pii.algorithms.feature_extraction import (
|
||||
extract_pii_tags,
|
||||
split_column_name,
|
||||
)
|
||||
from metadata.pii.algorithms.presidio_patches import url_patcher
|
||||
from metadata.pii.algorithms.presidio_patches import date_time_patcher, url_patcher
|
||||
from metadata.pii.algorithms.tags import PIITag
|
||||
|
||||
|
||||
@ -133,6 +133,30 @@ def test_person_extraction(fake, analyzer):
|
||||
)
|
||||
|
||||
|
||||
def test_date_time_extraction_false_positive_regression(fake, analyzer):
|
||||
"""
|
||||
Regression test for a false positive where a timestamp was incorrectly
|
||||
marked as a date by the Presidio analyzer.
|
||||
"""
|
||||
not_dates = [60001, 60002, 60003, 60004, 60005]
|
||||
not_dates_str = [str(date) for date in not_dates]
|
||||
extracted = extract_pii_tags(
|
||||
analyzer, not_dates_str, recognizer_result_patcher=date_time_patcher
|
||||
)
|
||||
assert PIITag.DATE_TIME not in extracted
|
||||
|
||||
|
||||
def test_date_time_extraction_with_patched_results(fake, analyzer):
|
||||
# Generate a list of dates and times
|
||||
samples = [str(fake.date_time_this_century()) for _ in range(100)]
|
||||
# Patch the results to avoid false positives
|
||||
extracted = extract_pii_tags(
|
||||
analyzer, samples, recognizer_result_patcher=date_time_patcher
|
||||
)
|
||||
|
||||
assert PIITag.DATE_TIME in extracted
|
||||
|
||||
|
||||
# Extraction with patched URL
|
||||
def test_email_address_extraction_does_not_extract_url(fake, analyzer):
|
||||
samples = [fake.email() for _ in range(100)]
|
||||
|
Loading…
x
Reference in New Issue
Block a user