mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-11-06 21:43:37 +00:00
Clean NER Scanner imports (#11653)
This commit is contained in:
parent
667706d09b
commit
8795337f88
@ -48,6 +48,7 @@ class NERScanner:
|
|||||||
Based on https://microsoft.github.io/presidio/
|
Based on https://microsoft.github.io/presidio/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
import spacy
|
import spacy
|
||||||
from presidio_analyzer import AnalyzerEngine
|
from presidio_analyzer import AnalyzerEngine
|
||||||
from presidio_analyzer.nlp_engine.spacy_nlp_engine import SpacyNlpEngine
|
from presidio_analyzer.nlp_engine.spacy_nlp_engine import SpacyNlpEngine
|
||||||
@ -61,7 +62,9 @@ class NERScanner:
|
|||||||
download(SPACY_EN_MODEL)
|
download(SPACY_EN_MODEL)
|
||||||
spacy.load(SPACY_EN_MODEL)
|
spacy.load(SPACY_EN_MODEL)
|
||||||
|
|
||||||
analyzer = AnalyzerEngine(nlp_engine=SpacyNlpEngine(models={"en": SPACY_EN_MODEL}))
|
self.analyzer = AnalyzerEngine(
|
||||||
|
nlp_engine=SpacyNlpEngine(models={"en": SPACY_EN_MODEL})
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_highest_score_label(
|
def get_highest_score_label(
|
||||||
@ -77,8 +80,7 @@ class NERScanner:
|
|||||||
most_used_label_occurrence = score[1]
|
most_used_label_occurrence = score[1]
|
||||||
return label_score or (None, None)
|
return label_score or (None, None)
|
||||||
|
|
||||||
@classmethod
|
def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
|
||||||
def scan(cls, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
|
|
||||||
"""
|
"""
|
||||||
Scan the column's sample data rows and look for PII
|
Scan the column's sample data rows and look for PII
|
||||||
"""
|
"""
|
||||||
@ -87,7 +89,7 @@ class NERScanner:
|
|||||||
str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
|
str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
|
||||||
for row in str_sample_data_rows:
|
for row in str_sample_data_rows:
|
||||||
try:
|
try:
|
||||||
results = cls.analyzer.analyze(row, language="en")
|
results = self.analyzer.analyze(row, language="en")
|
||||||
for result in results:
|
for result in results:
|
||||||
logger.debug("Found %s", result.entity_type)
|
logger.debug("Found %s", result.entity_type)
|
||||||
tag = result.entity_type
|
tag = result.entity_type
|
||||||
@ -104,7 +106,7 @@ class NERScanner:
|
|||||||
logger.warning(f"Unknown error while processing {row} - {exc}")
|
logger.warning(f"Unknown error while processing {row} - {exc}")
|
||||||
logger.debug(traceback.format_exc())
|
logger.debug(traceback.format_exc())
|
||||||
|
|
||||||
label, score = cls.get_highest_score_label(labels_score, str_sample_data_rows)
|
label, score = self.get_highest_score_label(labels_score, str_sample_data_rows)
|
||||||
if label and score:
|
if label and score:
|
||||||
tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
|
tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
|
||||||
return TagAndConfidence(tag=tag_type, confidence=score)
|
return TagAndConfidence(tag=tag_type, confidence=score)
|
||||||
|
|||||||
@ -34,6 +34,7 @@ class PIIProcessor:
|
|||||||
def __init__(self, metadata: OpenMetadata):
|
def __init__(self, metadata: OpenMetadata):
|
||||||
|
|
||||||
self.metadata = metadata
|
self.metadata = metadata
|
||||||
|
self.ner_scanner = NERScanner()
|
||||||
|
|
||||||
def patch_column_tag(
|
def patch_column_tag(
|
||||||
self, tag_type: str, table_entity: Table, column_name: str
|
self, tag_type: str, table_entity: Table, column_name: str
|
||||||
@ -81,7 +82,7 @@ class PIIProcessor:
|
|||||||
|
|
||||||
# Scan by column name. If no results there, check the sample data, if any
|
# Scan by column name. If no results there, check the sample data, if any
|
||||||
tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
|
tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
|
||||||
NERScanner.scan([row[idx] for row in table_data.rows])
|
self.ner_scanner.scan([row[idx] for row in table_data.rows])
|
||||||
if table_data
|
if table_data
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
|||||||
@ -22,10 +22,12 @@ class NERScannerTest(TestCase):
|
|||||||
Validate various typical column names
|
Validate various typical column names
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
ner_scanner = NERScanner()
|
||||||
|
|
||||||
def test_scanner_none(self):
|
def test_scanner_none(self):
|
||||||
self.assertIsNone(NERScanner.scan(list(range(100))))
|
self.assertIsNone(self.ner_scanner.scan(list(range(100))))
|
||||||
self.assertIsNone(
|
self.assertIsNone(
|
||||||
NERScanner.scan(
|
self.ner_scanner.scan(
|
||||||
" ".split(
|
" ".split(
|
||||||
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
|
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
|
||||||
)
|
)
|
||||||
@ -34,7 +36,7 @@ class NERScannerTest(TestCase):
|
|||||||
|
|
||||||
def test_scanner_sensitive(self):
|
def test_scanner_sensitive(self):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
NERScanner.scan(
|
self.ner_scanner.scan(
|
||||||
[
|
[
|
||||||
"geraldc@gmail.com",
|
"geraldc@gmail.com",
|
||||||
"saratimithi@godesign.com",
|
"saratimithi@godesign.com",
|
||||||
@ -44,6 +46,8 @@ class NERScannerTest(TestCase):
|
|||||||
TagType.SENSITIVE,
|
TagType.SENSITIVE,
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
NERScanner.scan(["im ok", "saratimithi@godesign.com", "not sensitive"]).tag,
|
self.ner_scanner.scan(
|
||||||
|
["im ok", "saratimithi@godesign.com", "not sensitive"]
|
||||||
|
).tag,
|
||||||
TagType.SENSITIVE,
|
TagType.SENSITIVE,
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user