Increase PIIProcessor._tolerance

This lets us correctly filter out low scores from classifiers while still keeping the normalization step that filters out ambiguous outcomes.

For example, an output with scores 0.3, 0.7 and 0.75 would first have the 0.3 filtered out, and then the other two would be discarded because they are both relatively high results.
This commit is contained in:
Eugenio Doñaque 2025-09-23 12:47:00 +02:00 committed by Eugenio
parent 0c080380d0
commit 1332d0aa06
2 changed files with 12 additions and 10 deletions

View File

@ -27,7 +27,11 @@ from metadata.generated.schema.type.tagLabel import (
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.pii.algorithms.tags import PIISensitivityTag
from metadata.pii.algorithms.utils import build_reason, get_top_classes
from metadata.pii.algorithms.utils import (
build_reason,
get_top_classes,
normalize_scores,
)
from metadata.pii.base_processor import AutoClassificationProcessor
from metadata.pii.constants import PII
from metadata.utils import fqn
@ -45,6 +49,7 @@ class PIIProcessor(AutoClassificationProcessor):
self,
config: OpenMetadataWorkflowConfig,
metadata: OpenMetadata,
tolerance: float = 0.7,
):
super().__init__(config, metadata)
@ -56,10 +61,10 @@ class PIIProcessor(AutoClassificationProcessor):
self._classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier()
self.confidence_threshold = self.source_config.confidence / 100
self._tolerance = 0.01
self._tolerance = tolerance
@staticmethod
def build_tag_label(tag: PIISensitivityTag, reason: str) -> TagLabel:
def build_tag_label(tag: PIISensitivityTag, score: float) -> TagLabel:
tag_fqn = fqn.build(
metadata=None,
entity_type=Tag,
@ -72,7 +77,7 @@ class PIIProcessor(AutoClassificationProcessor):
source=TagSource.Classification,
state=State.Suggested,
labelType=LabelType.Generated,
reason=reason,
reason=build_reason(tag_fqn, score),
)
return tag_label
@ -97,8 +102,5 @@ class PIIProcessor(AutoClassificationProcessor):
# winner is at most 1 tag
winner = get_top_classes(scores, 1, self.confidence_threshold)
tag_labels = [
self.build_tag_label(tag, build_reason(tag.value, scores[tag]))
for tag in winner
]
tag_labels = [self.build_tag_label(tag, scores[tag]) for tag in winner]
return tag_labels

View File

@ -312,8 +312,8 @@ class PiiProcessorTest(TestCase):
self.assertEqual(expected.column_fqn, updated.column_fqn)
self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN)
self.assertRegex(
updated.value,
updated.tag_label.reason,
expected_regex=re.compile(
f"Chose {expected.tag_label.name} with a classification score of \d+([.,]?\d{{1,2}})?"
f"Chose {expected.tag_label.tagFQN.root} with a classification score of \d+([.,]?\d{{1,2}})?"
),
)