mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-09-26 01:15:08 +00:00
Increase PIIProcessor._tolerance
This lets us correctly filter out low scores from classifiers while still keeping the normalization that filters out confusing outcomes. For example, an output with scores 0.3, 0.7 and 0.75 would first have the 0.3 filtered out, and then the other two would be discarded because they are both relatively high results.
This commit is contained in:
parent
0c080380d0
commit
1332d0aa06
@ -27,7 +27,11 @@ from metadata.generated.schema.type.tagLabel import (
|
||||
)
|
||||
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||
from metadata.pii.algorithms.tags import PIISensitivityTag
|
||||
from metadata.pii.algorithms.utils import build_reason, get_top_classes
|
||||
from metadata.pii.algorithms.utils import (
|
||||
build_reason,
|
||||
get_top_classes,
|
||||
normalize_scores,
|
||||
)
|
||||
from metadata.pii.base_processor import AutoClassificationProcessor
|
||||
from metadata.pii.constants import PII
|
||||
from metadata.utils import fqn
|
||||
@ -45,6 +49,7 @@ class PIIProcessor(AutoClassificationProcessor):
|
||||
self,
|
||||
config: OpenMetadataWorkflowConfig,
|
||||
metadata: OpenMetadata,
|
||||
tolerance: float = 0.7,
|
||||
):
|
||||
super().__init__(config, metadata)
|
||||
|
||||
@ -56,10 +61,10 @@ class PIIProcessor(AutoClassificationProcessor):
|
||||
self._classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier()
|
||||
|
||||
self.confidence_threshold = self.source_config.confidence / 100
|
||||
self._tolerance = 0.01
|
||||
self._tolerance = tolerance
|
||||
|
||||
@staticmethod
|
||||
def build_tag_label(tag: PIISensitivityTag, reason: str) -> TagLabel:
|
||||
def build_tag_label(tag: PIISensitivityTag, score: float) -> TagLabel:
|
||||
tag_fqn = fqn.build(
|
||||
metadata=None,
|
||||
entity_type=Tag,
|
||||
@ -72,7 +77,7 @@ class PIIProcessor(AutoClassificationProcessor):
|
||||
source=TagSource.Classification,
|
||||
state=State.Suggested,
|
||||
labelType=LabelType.Generated,
|
||||
reason=reason,
|
||||
reason=build_reason(tag_fqn, score),
|
||||
)
|
||||
|
||||
return tag_label
|
||||
@ -97,8 +102,5 @@ class PIIProcessor(AutoClassificationProcessor):
|
||||
|
||||
# winner is at most 1 tag
|
||||
winner = get_top_classes(scores, 1, self.confidence_threshold)
|
||||
tag_labels = [
|
||||
self.build_tag_label(tag, build_reason(tag.value, scores[tag]))
|
||||
for tag in winner
|
||||
]
|
||||
tag_labels = [self.build_tag_label(tag, scores[tag]) for tag in winner]
|
||||
return tag_labels
|
||||
|
@ -312,8 +312,8 @@ class PiiProcessorTest(TestCase):
|
||||
self.assertEqual(expected.column_fqn, updated.column_fqn)
|
||||
self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN)
|
||||
self.assertRegex(
|
||||
updated.value,
|
||||
updated.tag_label.reason,
|
||||
expected_regex=re.compile(
|
||||
f"Chose {expected.tag_label.name} with a classification score of \d+([.,]?\d{{1,2}})?"
|
||||
f"Chose {expected.tag_label.tagFQN.root} with a classification score of \d+([.,]?\d{{1,2}})?"
|
||||
),
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user