mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-11-04 04:29:13 +00:00 
			
		
		
		
	Clean NER Scanner imports (#11653)
This commit is contained in:
		
							parent
							
								
									667706d09b
								
							
						
					
					
						commit
						8795337f88
					
				@ -48,6 +48,7 @@ class NERScanner:
 | 
			
		||||
    Based on https://microsoft.github.io/presidio/
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        import spacy
 | 
			
		||||
        from presidio_analyzer import AnalyzerEngine
 | 
			
		||||
        from presidio_analyzer.nlp_engine.spacy_nlp_engine import SpacyNlpEngine
 | 
			
		||||
@ -61,7 +62,9 @@ class NERScanner:
 | 
			
		||||
            download(SPACY_EN_MODEL)
 | 
			
		||||
            spacy.load(SPACY_EN_MODEL)
 | 
			
		||||
 | 
			
		||||
    analyzer = AnalyzerEngine(nlp_engine=SpacyNlpEngine(models={"en": SPACY_EN_MODEL}))
 | 
			
		||||
        self.analyzer = AnalyzerEngine(
 | 
			
		||||
            nlp_engine=SpacyNlpEngine(models={"en": SPACY_EN_MODEL})
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def get_highest_score_label(
 | 
			
		||||
@ -77,8 +80,7 @@ class NERScanner:
 | 
			
		||||
                most_used_label_occurrence = score[1]
 | 
			
		||||
        return label_score or (None, None)
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def scan(cls, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
 | 
			
		||||
    def scan(self, sample_data_rows: List[Any]) -> Optional[TagAndConfidence]:
 | 
			
		||||
        """
 | 
			
		||||
        Scan the column's sample data rows and look for PII
 | 
			
		||||
        """
 | 
			
		||||
@ -87,7 +89,7 @@ class NERScanner:
 | 
			
		||||
        str_sample_data_rows = [str(row) for row in sample_data_rows if row is not None]
 | 
			
		||||
        for row in str_sample_data_rows:
 | 
			
		||||
            try:
 | 
			
		||||
                results = cls.analyzer.analyze(row, language="en")
 | 
			
		||||
                results = self.analyzer.analyze(row, language="en")
 | 
			
		||||
                for result in results:
 | 
			
		||||
                    logger.debug("Found %s", result.entity_type)
 | 
			
		||||
                    tag = result.entity_type
 | 
			
		||||
@ -104,7 +106,7 @@ class NERScanner:
 | 
			
		||||
                logger.warning(f"Unknown error while processing {row} - {exc}")
 | 
			
		||||
                logger.debug(traceback.format_exc())
 | 
			
		||||
 | 
			
		||||
        label, score = cls.get_highest_score_label(labels_score, str_sample_data_rows)
 | 
			
		||||
        label, score = self.get_highest_score_label(labels_score, str_sample_data_rows)
 | 
			
		||||
        if label and score:
 | 
			
		||||
            tag_type = NEREntity.__members__.get(label, TagType.NONSENSITIVE).value
 | 
			
		||||
            return TagAndConfidence(tag=tag_type, confidence=score)
 | 
			
		||||
 | 
			
		||||
@ -34,6 +34,7 @@ class PIIProcessor:
 | 
			
		||||
    def __init__(self, metadata: OpenMetadata):
 | 
			
		||||
 | 
			
		||||
        self.metadata = metadata
 | 
			
		||||
        self.ner_scanner = NERScanner()
 | 
			
		||||
 | 
			
		||||
    def patch_column_tag(
 | 
			
		||||
        self, tag_type: str, table_entity: Table, column_name: str
 | 
			
		||||
@ -81,7 +82,7 @@ class PIIProcessor:
 | 
			
		||||
 | 
			
		||||
            # Scan by column name. If no results there, check the sample data, if any
 | 
			
		||||
            tag_and_confidence = ColumnNameScanner.scan(column.name.__root__) or (
 | 
			
		||||
                NERScanner.scan([row[idx] for row in table_data.rows])
 | 
			
		||||
                self.ner_scanner.scan([row[idx] for row in table_data.rows])
 | 
			
		||||
                if table_data
 | 
			
		||||
                else None
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
@ -22,10 +22,12 @@ class NERScannerTest(TestCase):
 | 
			
		||||
    Validate NER-based PII detection on sample column values
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    ner_scanner = NERScanner()
 | 
			
		||||
 | 
			
		||||
    def test_scanner_none(self):
 | 
			
		||||
        self.assertIsNone(NERScanner.scan(list(range(100))))
 | 
			
		||||
        self.assertIsNone(self.ner_scanner.scan(list(range(100))))
 | 
			
		||||
        self.assertIsNone(
 | 
			
		||||
            NERScanner.scan(
 | 
			
		||||
            self.ner_scanner.scan(
 | 
			
		||||
                " ".split(
 | 
			
		||||
                    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
 | 
			
		||||
                )
 | 
			
		||||
@ -34,7 +36,7 @@ class NERScannerTest(TestCase):
 | 
			
		||||
 | 
			
		||||
    def test_scanner_sensitive(self):
 | 
			
		||||
        self.assertEqual(
 | 
			
		||||
            NERScanner.scan(
 | 
			
		||||
            self.ner_scanner.scan(
 | 
			
		||||
                [
 | 
			
		||||
                    "geraldc@gmail.com",
 | 
			
		||||
                    "saratimithi@godesign.com",
 | 
			
		||||
@ -44,6 +46,8 @@ class NERScannerTest(TestCase):
 | 
			
		||||
            TagType.SENSITIVE,
 | 
			
		||||
        )
 | 
			
		||||
        self.assertEqual(
 | 
			
		||||
            NERScanner.scan(["im ok", "saratimithi@godesign.com", "not sensitive"]).tag,
 | 
			
		||||
            self.ner_scanner.scan(
 | 
			
		||||
                ["im ok", "saratimithi@godesign.com", "not sensitive"]
 | 
			
		||||
            ).tag,
 | 
			
		||||
            TagType.SENSITIVE,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user