mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-09-26 17:34:41 +00:00
Increase PIIProcessor._tolerance
This lets us correctly filter out low scores from classifiers while still keeping the normalization that filters out confusing outcomes. For example, an output with scores 0.3, 0.7 and 0.75 would first have the 0.3 filtered out, and the other two would then be discarded because they are both relatively high results.
This commit is contained in:
parent
0c080380d0
commit
1332d0aa06
@ -27,7 +27,11 @@ from metadata.generated.schema.type.tagLabel import (
|
|||||||
)
|
)
|
||||||
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||||
from metadata.pii.algorithms.tags import PIISensitivityTag
|
from metadata.pii.algorithms.tags import PIISensitivityTag
|
||||||
from metadata.pii.algorithms.utils import build_reason, get_top_classes
|
from metadata.pii.algorithms.utils import (
|
||||||
|
build_reason,
|
||||||
|
get_top_classes,
|
||||||
|
normalize_scores,
|
||||||
|
)
|
||||||
from metadata.pii.base_processor import AutoClassificationProcessor
|
from metadata.pii.base_processor import AutoClassificationProcessor
|
||||||
from metadata.pii.constants import PII
|
from metadata.pii.constants import PII
|
||||||
from metadata.utils import fqn
|
from metadata.utils import fqn
|
||||||
@ -45,6 +49,7 @@ class PIIProcessor(AutoClassificationProcessor):
|
|||||||
self,
|
self,
|
||||||
config: OpenMetadataWorkflowConfig,
|
config: OpenMetadataWorkflowConfig,
|
||||||
metadata: OpenMetadata,
|
metadata: OpenMetadata,
|
||||||
|
tolerance: float = 0.7,
|
||||||
):
|
):
|
||||||
super().__init__(config, metadata)
|
super().__init__(config, metadata)
|
||||||
|
|
||||||
@ -56,10 +61,10 @@ class PIIProcessor(AutoClassificationProcessor):
|
|||||||
self._classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier()
|
self._classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier()
|
||||||
|
|
||||||
self.confidence_threshold = self.source_config.confidence / 100
|
self.confidence_threshold = self.source_config.confidence / 100
|
||||||
self._tolerance = 0.01
|
self._tolerance = tolerance
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def build_tag_label(tag: PIISensitivityTag, reason: str) -> TagLabel:
|
def build_tag_label(tag: PIISensitivityTag, score: float) -> TagLabel:
|
||||||
tag_fqn = fqn.build(
|
tag_fqn = fqn.build(
|
||||||
metadata=None,
|
metadata=None,
|
||||||
entity_type=Tag,
|
entity_type=Tag,
|
||||||
@ -72,7 +77,7 @@ class PIIProcessor(AutoClassificationProcessor):
|
|||||||
source=TagSource.Classification,
|
source=TagSource.Classification,
|
||||||
state=State.Suggested,
|
state=State.Suggested,
|
||||||
labelType=LabelType.Generated,
|
labelType=LabelType.Generated,
|
||||||
reason=reason,
|
reason=build_reason(tag_fqn, score),
|
||||||
)
|
)
|
||||||
|
|
||||||
return tag_label
|
return tag_label
|
||||||
@ -97,8 +102,5 @@ class PIIProcessor(AutoClassificationProcessor):
|
|||||||
|
|
||||||
# winner is at most 1 tag
|
# winner is at most 1 tag
|
||||||
winner = get_top_classes(scores, 1, self.confidence_threshold)
|
winner = get_top_classes(scores, 1, self.confidence_threshold)
|
||||||
tag_labels = [
|
tag_labels = [self.build_tag_label(tag, scores[tag]) for tag in winner]
|
||||||
self.build_tag_label(tag, build_reason(tag.value, scores[tag]))
|
|
||||||
for tag in winner
|
|
||||||
]
|
|
||||||
return tag_labels
|
return tag_labels
|
||||||
|
@ -312,8 +312,8 @@ class PiiProcessorTest(TestCase):
|
|||||||
self.assertEqual(expected.column_fqn, updated.column_fqn)
|
self.assertEqual(expected.column_fqn, updated.column_fqn)
|
||||||
self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN)
|
self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN)
|
||||||
self.assertRegex(
|
self.assertRegex(
|
||||||
updated.value,
|
updated.tag_label.reason,
|
||||||
expected_regex=re.compile(
|
expected_regex=re.compile(
|
||||||
f"Chose {expected.tag_label.name} with a classification score of \d+([.,]?\d{{1,2}})?"
|
f"Chose {expected.tag_label.tagFQN.root} with a classification score of \d+([.,]?\d{{1,2}})?"
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user