mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-10-12 17:26:43 +00:00
* Reproduce failing behaviour with non-date-time data * Add a presidio patch for DateTimes * Fix type-check error --------- Co-authored-by: Pere Menal <pere.menal@getcollate.io>
This commit is contained in:
parent
324aab71a4
commit
6683c632f4
@ -38,7 +38,11 @@ from metadata.pii.algorithms.feature_extraction import (
|
|||||||
split_column_name,
|
split_column_name,
|
||||||
)
|
)
|
||||||
from metadata.pii.algorithms.preprocessing import preprocess_values
|
from metadata.pii.algorithms.preprocessing import preprocess_values
|
||||||
from metadata.pii.algorithms.presidio_patches import url_patcher
|
from metadata.pii.algorithms.presidio_patches import (
|
||||||
|
combine_patchers,
|
||||||
|
date_time_patcher,
|
||||||
|
url_patcher,
|
||||||
|
)
|
||||||
from metadata.pii.algorithms.presidio_utils import (
|
from metadata.pii.algorithms.presidio_utils import (
|
||||||
build_analyzer_engine,
|
build_analyzer_engine,
|
||||||
set_presidio_logger_level,
|
set_presidio_logger_level,
|
||||||
@ -119,7 +123,7 @@ class HeuristicPIIClassifier(ColumnClassifier[PIITag]):
|
|||||||
self._presidio_analyzer,
|
self._presidio_analyzer,
|
||||||
str_values,
|
str_values,
|
||||||
context=context,
|
context=context,
|
||||||
recognizer_result_patcher=url_patcher,
|
recognizer_result_patcher=combine_patchers(date_time_patcher, url_patcher),
|
||||||
)
|
)
|
||||||
|
|
||||||
column_name_matches: Set[PIITag] = set()
|
column_name_matches: Set[PIITag] = set()
|
||||||
|
@ -13,6 +13,7 @@ Patch the Presidio recognizer results to adapt them to specific use cases.
|
|||||||
"""
|
"""
|
||||||
from typing import List, Protocol, Sequence
|
from typing import List, Protocol, Sequence
|
||||||
|
|
||||||
|
from dateutil.parser import parse
|
||||||
from presidio_analyzer import RecognizerResult
|
from presidio_analyzer import RecognizerResult
|
||||||
|
|
||||||
|
|
||||||
@ -29,6 +30,24 @@ class PresidioRecognizerResultPatcher(Protocol):
|
|||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def combine_patchers(
    *patchers: PresidioRecognizerResultPatcher,
) -> PresidioRecognizerResultPatcher:
    """
    Chain several recognizer-result patchers into a single patcher.

    The returned callable passes the recognizer results through every patcher
    in the order given, handing the output of each one to the next together
    with the same text. With no patchers the results come back unchanged.
    """

    def combined_patcher(
        recognizer_results: Sequence[RecognizerResult], text: str
    ) -> Sequence[RecognizerResult]:
        patched = recognizer_results
        for apply_patch in patchers:
            # Each patcher works on whatever the previous one left behind.
            patched = apply_patch(patched, text)
        return patched

    return combined_patcher
|
||||||
|
|
||||||
|
|
||||||
def url_patcher(
|
def url_patcher(
|
||||||
recognizer_results: Sequence[RecognizerResult], text: str
|
recognizer_results: Sequence[RecognizerResult], text: str
|
||||||
) -> Sequence[RecognizerResult]:
|
) -> Sequence[RecognizerResult]:
|
||||||
@ -43,3 +62,22 @@ def url_patcher(
|
|||||||
continue
|
continue
|
||||||
patched_result.append(result)
|
patched_result.append(result)
|
||||||
return patched_result
|
return patched_result
|
||||||
|
|
||||||
|
|
||||||
|
def date_time_patcher(
    recognizer_results: Sequence[RecognizerResult], text: str
) -> Sequence[RecognizerResult]:
    """
    Drop DATE_TIME results whose matched text is not a parseable date.

    Every result whose entity type is ``DATE_TIME`` is re-checked by parsing
    the span ``text[result.start:result.end]`` with ``dateutil.parser.parse``.
    Results whose span cannot be parsed are discarded; results of any other
    entity type are kept untouched. The relative order is preserved.

    Args:
        recognizer_results: results produced by the analyzer for ``text``.
        text: the string that was analyzed.

    Returns:
        A new list without the unparseable DATE_TIME results.
    """
    patched_result: List[RecognizerResult] = []
    for result in recognizer_results:
        if result.entity_type == "DATE_TIME":
            candidate = text[result.start : result.end]
            try:
                parse(candidate)
            except (ValueError, OverflowError):
                # Parse failures surface as ValueError; OverflowError is
                # raised for numeric spans too large for a C integer. Either
                # way the span is not a usable date, so drop the result
                # instead of letting the error abort the classification.
                continue
        patched_result.append(result)
    return patched_result
|
||||||
|
@ -70,6 +70,19 @@ phone_data: LabeledData = {
|
|||||||
"pii_sensitivity": True,
|
"pii_sensitivity": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# String-typed column of timestamp strings with a trailing "Z"; labeled as
# DATE_TIME and as non-sensitive.
data_time_data: LabeledData = {
    "column_name": "event_time",
    "column_data_type": DataType.STRING,
    "sample_data": [
        "2023-10-01 12:00:00Z",
        "2023-10-02 15:30:00Z",
        "2023-10-03 18:45:00Z",
        "2023-10-04 21:15:00Z",
    ],
    "pii_tags": [PIITag.DATE_TIME],
    "pii_sensitivity": False,
}
|
||||||
|
|
||||||
non_pii_text_data: LabeledData = {
|
non_pii_text_data: LabeledData = {
|
||||||
"column_name": "random_text",
|
"column_name": "random_text",
|
||||||
"column_data_type": DataType.STRING,
|
"column_data_type": DataType.STRING,
|
||||||
@ -173,8 +186,26 @@ es_nif_data: LabeledData = {
|
|||||||
"sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"],
|
"sample_data": ["48347544A", "08163649Y", "85738706L", "01922869T", "44729355J"],
|
||||||
"pii_tags": [
|
"pii_tags": [
|
||||||
PIITag.ES_NIF,
|
PIITag.ES_NIF,
|
||||||
PIITag.DATE_TIME,
|
|
||||||
PIITag.US_DRIVER_LICENSE, # low score
|
PIITag.US_DRIVER_LICENSE, # low score
|
||||||
],
|
],
|
||||||
"pii_sensitivity": True,
|
"pii_sensitivity": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Sample data for regression tests
|
||||||
|
|
||||||
|
# Plain five-digit numeric strings that used to be tagged as PIITag.DATE_TIME
# although they are not dates; labeled with no PII tags at all.
false_positive_datetime_data: LabeledData = {
    "column_name": None,
    "column_data_type": DataType.STRING,
    "sample_data": ["60001", "60002", "60003", "60004", "60005", "60006", "60007"],
    "pii_tags": [],
    "pii_sensitivity": False,
}
|
||||||
|
@ -16,7 +16,7 @@ from metadata.pii.algorithms.feature_extraction import (
|
|||||||
extract_pii_tags,
|
extract_pii_tags,
|
||||||
split_column_name,
|
split_column_name,
|
||||||
)
|
)
|
||||||
from metadata.pii.algorithms.presidio_patches import url_patcher
|
from metadata.pii.algorithms.presidio_patches import date_time_patcher, url_patcher
|
||||||
from metadata.pii.algorithms.tags import PIITag
|
from metadata.pii.algorithms.tags import PIITag
|
||||||
|
|
||||||
|
|
||||||
@ -133,6 +133,30 @@ def test_person_extraction(fake, analyzer):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_date_time_extraction_false_positive_regression(fake, analyzer):
    """
    Regression test: plain integers rendered as strings must not be tagged
    as DATE_TIME when the date-time patcher is applied.
    """
    # "60001" .. "60005": numeric strings that are not dates.
    samples = [str(number) for number in range(60001, 60006)]
    pii_tags = extract_pii_tags(
        analyzer, samples, recognizer_result_patcher=date_time_patcher
    )
    assert PIITag.DATE_TIME not in pii_tags
|
||||||
|
|
||||||
|
|
||||||
|
def test_date_time_extraction_with_patched_results(fake, analyzer):
    """
    Genuine date-times must still be tagged as DATE_TIME when the date-time
    patcher is applied.
    """
    # 100 generated date-times, rendered through ``str``.
    date_time_strings = [str(fake.date_time_this_century()) for _ in range(100)]

    pii_tags = extract_pii_tags(
        analyzer, date_time_strings, recognizer_result_patcher=date_time_patcher
    )

    assert PIITag.DATE_TIME in pii_tags
|
||||||
|
|
||||||
|
|
||||||
# Extraction with patched URL
|
# Extraction with patched URL
|
||||||
def test_email_address_extraction_does_not_extract_url(fake, analyzer):
|
def test_email_address_extraction_does_not_extract_url(fake, analyzer):
|
||||||
samples = [fake.email() for _ in range(100)]
|
samples = [fake.email() for _ in range(100)]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user