From 1332d0aa069cefb8170cbf766dbf8ef1c176c71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20Do=C3=B1aque?= Date: Tue, 23 Sep 2025 12:47:00 +0200 Subject: [PATCH] Increase `PIIProcessor._tolerance` This is so we correctly filter out low scores from classifiers while still maintaining the normalization that filters out confusing outcomes. E.g., for an output with scores 0.3, 0.7 and 0.75, we would first filter out the 0.3 and then discard the other two because they are both relatively high, which makes the outcome confusing. --- ingestion/src/metadata/pii/processor.py | 18 ++++++++++-------- .../orm_profiler/test_pii_processor.py | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py index 80353d0fb51..67765508b6a 100644 --- a/ingestion/src/metadata/pii/processor.py +++ b/ingestion/src/metadata/pii/processor.py @@ -27,7 +27,11 @@ from metadata.generated.schema.type.tagLabel import ( ) from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.pii.algorithms.tags import PIISensitivityTag -from metadata.pii.algorithms.utils import build_reason, get_top_classes +from metadata.pii.algorithms.utils import ( + build_reason, + get_top_classes, + normalize_scores, +) from metadata.pii.base_processor import AutoClassificationProcessor from metadata.pii.constants import PII from metadata.utils import fqn @@ -45,6 +49,7 @@ class PIIProcessor(AutoClassificationProcessor): self, config: OpenMetadataWorkflowConfig, metadata: OpenMetadata, + tolerance: float = 0.7, ): super().__init__(config, metadata) @@ -56,10 +61,10 @@ class PIIProcessor(AutoClassificationProcessor): self._classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier() self.confidence_threshold = self.source_config.confidence / 100 - self._tolerance = 0.01 + self._tolerance = tolerance @staticmethod - def build_tag_label(tag: PIISensitivityTag, reason: str) -> TagLabel: + def build_tag_label(tag: PIISensitivityTag, 
score: float) -> TagLabel: tag_fqn = fqn.build( metadata=None, entity_type=Tag, @@ -72,7 +77,7 @@ class PIIProcessor(AutoClassificationProcessor): source=TagSource.Classification, state=State.Suggested, labelType=LabelType.Generated, - reason=reason, + reason=build_reason(tag_fqn, score), ) return tag_label @@ -97,8 +102,5 @@ class PIIProcessor(AutoClassificationProcessor): # winner is at most 1 tag winner = get_top_classes(scores, 1, self.confidence_threshold) - tag_labels = [ - self.build_tag_label(tag, build_reason(tag.value, scores[tag])) - for tag in winner - ] + tag_labels = [self.build_tag_label(tag, scores[tag]) for tag in winner] return tag_labels diff --git a/ingestion/tests/integration/orm_profiler/test_pii_processor.py b/ingestion/tests/integration/orm_profiler/test_pii_processor.py index 44ab978bed0..847c3b12d47 100644 --- a/ingestion/tests/integration/orm_profiler/test_pii_processor.py +++ b/ingestion/tests/integration/orm_profiler/test_pii_processor.py @@ -312,8 +312,8 @@ class PiiProcessorTest(TestCase): self.assertEqual(expected.column_fqn, updated.column_fqn) self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN) self.assertRegex( - updated.value, + updated.tag_label.reason, expected_regex=re.compile( - f"Chose {expected.tag_label.name} with a classification score of \d+([.,]?\d{{1,2}})?" + f"Chose {expected.tag_label.tagFQN.root} with a classification score of \d+([.,]?\d{{1,2}})?" ), )