From 5d2dfa712ae24356265aabe6319daa7d33c86e5c Mon Sep 17 00:00:00 2001
From: Pere Menal-Ferrer <p.menal@stuart.com>
Date: Mon, 19 May 2025 19:52:17 +0200
Subject: [PATCH] feature/pii-processor-improvement (#21248)

* Add PII Tag and Sensitivity Level enums.

* Add feature-extraction for PII classification tasks

* Add faker as test dependency

* Add unit tests for presidio tag extractor

* Add PIISensitivityTags enum and update sensitivity mapping logic

* Add Presidio utility functions for PII analysis

* Extend column name regexes for PII

* Add tests for PAN, NIF, SSN entities

* Fix version of faker to prevent flaky tests. Fix failing tests.

* Add Generated to State enum

* Integrate PIISensitive classifier into PIIProcessor
---
 .pre-commit-config.yaml                       |   2 +-
 ingestion/pyproject.toml                      |   2 +-
 .../metadata/pii/algorithms/classifiers.py    |   7 +
 .../metadata/pii/algorithms/preprocessing.py  |   4 +-
 .../src/metadata/pii/algorithms/utils.py      |  38 ++++
 ingestion/src/metadata/pii/base_processor.py  | 125 +++++++++++++
 ingestion/src/metadata/pii/processor.py       | 176 ++++--------------
 .../orm_profiler/test_pii_processor.py        |   1 -
 .../tests/unit/pii/test_pii_sensitive.py      |  35 ++++
 9 files changed, 250 insertions(+), 140 deletions(-)
 create mode 100644 ingestion/src/metadata/pii/algorithms/utils.py
 create mode 100644 ingestion/src/metadata/pii/base_processor.py
 create mode 100644 ingestion/tests/unit/pii/test_pii_sensitive.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9b68634eb0d..3bc4bcc9b35 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,7 +7,7 @@ repos:
       - id: check-json
         exclude: vscode
   - repo: https://github.com/hadialqattan/pycln
-    rev: v2.4.0
+    rev: v2.5.0
     hooks:
       - id: pycln
         files: ^(ingestion|openmetadata-airflow-apis)/
diff --git a/ingestion/pyproject.toml b/ingestion/pyproject.toml
index d3db292e3af..a6424f1c53b 100644
--- a/ingestion/pyproject.toml
+++ b/ingestion/pyproject.toml
@@ -260,7 +260,7 @@ ignore = [
   "src/metadata/mixins/*",
   "src/metadata/parsers/*",
   "src/metadata/pii/scanners/*",
-  "src/metadata/pii/processor.py",
+  "src/metadata/pii/*processor.py",
   "src/metadata/profiler/*",
   "src/metadata/sampler/*",
   "src/metadata/readers/*",
diff --git a/ingestion/src/metadata/pii/algorithms/classifiers.py b/ingestion/src/metadata/pii/algorithms/classifiers.py
index 483297dfb82..35de6fee3ae 100644
--- a/ingestion/src/metadata/pii/algorithms/classifiers.py
+++ b/ingestion/src/metadata/pii/algorithms/classifiers.py
@@ -164,10 +164,17 @@ class PIISensitiveClassifier(ColumnClassifier[PIISensitivityTag]):
             sample_data, column_name, column_data_type
         )
         results: DefaultDict[PIISensitivityTag, float] = defaultdict(float)
+        counts: DefaultDict[PIISensitivityTag, int] = defaultdict(int)
 
         for tag, score in pii_tags.items():
             # Convert PIITag to PIISensitivityTag
             pii_sensitivity = tag.sensitivity()
             results[pii_sensitivity] += score
+            counts[pii_sensitivity] += 1
+
+        # Normalize the scores
+        for tag in results:
+            if counts[tag] > 0:
+                results[tag] /= counts[tag]
 
         return results
diff --git a/ingestion/src/metadata/pii/algorithms/preprocessing.py b/ingestion/src/metadata/pii/algorithms/preprocessing.py
index ab1938dee87..8f7080193bd 100644
--- a/ingestion/src/metadata/pii/algorithms/preprocessing.py
+++ b/ingestion/src/metadata/pii/algorithms/preprocessing.py
@@ -11,6 +11,7 @@
 """
 Preprocessing functions for the classification tasks.
 """
+import datetime
 import json
 from typing import Any, List, Mapping, Optional, Sequence
 
@@ -27,7 +28,8 @@ def convert_to_str(value: Any) -> Optional[str]:
     """
     if isinstance(value, str):
         return value
-    if isinstance(value, (int, float)):
+    if isinstance(value, (int, float, datetime.datetime, datetime.date)):
+        # Values we want to convert to string out of the box
         return str(value)
     if isinstance(value, bytes):
         return value.decode("utf-8", errors="ignore")
diff --git a/ingestion/src/metadata/pii/algorithms/utils.py b/ingestion/src/metadata/pii/algorithms/utils.py
new file mode 100644
index 00000000000..cf482336c9d
--- /dev/null
+++ b/ingestion/src/metadata/pii/algorithms/utils.py
@@ -0,0 +1,38 @@
+#  Copyright 2025 Collate
+#  Licensed under the Collate Community License, Version 1.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""
+Utility functions for PII algorithms
+"""
+from typing import Mapping, Sequence, TypeVar
+
+T = TypeVar("T")
+
+
+def normalize_scores(scores: Mapping[T, float], tol: float = 0.01) -> Mapping[T, float]:
+    """
+    Normalize the scores to sum to 1, while ignoring scores below the tolerance.
+    Scores must be positive.
+    """
+    scores = {key: score for key, score in scores.items() if score > tol}
+    total = sum(scores.values())
+    if total == 0:
+        return scores
+    return {key: score / total for key, score in scores.items()}
+
+
+def get_top_classes(scores: Mapping[T, float], n: int, threshold: float) -> Sequence[T]:
+    """
+    Get the top n scores from the scores mapping that are above the threshold.
+    The classes are sorted in descending order of their scores.
+    """
+    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+    top_classes = [key for key, score in sorted_scores if score >= threshold]
+    return top_classes[:n]
diff --git a/ingestion/src/metadata/pii/base_processor.py b/ingestion/src/metadata/pii/base_processor.py
new file mode 100644
index 00000000000..0d94178b0e2
--- /dev/null
+++ b/ingestion/src/metadata/pii/base_processor.py
@@ -0,0 +1,125 @@
+#  Copyright 2025 Collate
+#  Licensed under the Collate Community License, Version 1.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""
+Base class for the Auto Classification Processor.
+"""
+import traceback
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Sequence, Type, TypeVar, cast, final
+
+from metadata.generated.schema.entity.data.table import Column
+from metadata.generated.schema.entity.services.ingestionPipelines.status import (
+    StackTraceError,
+)
+from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
+    DatabaseServiceAutoClassificationPipeline,
+)
+from metadata.generated.schema.metadataIngestion.workflow import (
+    OpenMetadataWorkflowConfig,
+)
+from metadata.generated.schema.type.tagLabel import TagLabel
+from metadata.ingestion.api.models import Either
+from metadata.ingestion.api.parser import parse_workflow_config_gracefully
+from metadata.ingestion.api.steps import Processor
+from metadata.ingestion.models.table_metadata import ColumnTag
+from metadata.ingestion.ometa.ometa_api import OpenMetadata
+from metadata.sampler.models import SamplerResponse
+
+C = TypeVar("C", bound="AutoClassificationProcessor")
+
+
+class AutoClassificationProcessor(Processor, ABC):
+    """
+    Abstract class for the Auto Classification Processor.
+
+    Implementations should only provide the logic for creating tags based on sample data,
+    and rely on the running part to be handled by the base class.
+    """
+
+    # Some methods are marked as final to prevent overriding in subclasses. This
+    # ensures the workflow always runs the same way and leaves implementers with
+    # the responsibility of *only* implementing the logic for creating tags.
+    def __init__(
+        self,
+        config: OpenMetadataWorkflowConfig,
+        metadata: OpenMetadata,
+    ):
+        super().__init__()
+        self.config = config
+        self.metadata = metadata
+
+        # Init and type the source config
+        self.source_config: DatabaseServiceAutoClassificationPipeline = cast(
+            DatabaseServiceAutoClassificationPipeline,
+            self.config.source.sourceConfig.config,
+        )  # Used to satisfy the type checker
+
+    @abstractmethod
+    def create_column_tag_labels(
+        self, column: Column, sample_data: Sequence[Any]
+    ) -> Sequence[TagLabel]:
+        """
+        Create tags for the column based on the sample data.
+        """
+
+    @property
+    def name(self) -> str:
+        return "Auto Classification Processor"
+
+    def close(self) -> None:
+        """Nothing to close"""
+
+    @classmethod
+    @final
+    def create(
+        cls: Type[C],
+        config_dict: dict,
+        metadata: OpenMetadata,
+        pipeline_name: Optional[str] = None,
+    ) -> C:
+        config = parse_workflow_config_gracefully(config_dict)
+        return cls(config=config, metadata=metadata)
+
+    @final
+    def _run(self, record: SamplerResponse) -> Either[SamplerResponse]:
+        """
+        Main entrypoint for the processor.
+        """
+
+        # We don't always need to process
+        if not self.source_config.enableAutoClassification:
+            return Either(right=record, left=None)
+
+        column_tags = []
+
+        for idx, column in enumerate(record.table.columns):
+            try:
+                tags = self.create_column_tag_labels(
+                    column=column,
+                    sample_data=[row[idx] for row in record.sample_data.data.rows],
+                )
+                for tag in tags:
+                    column_tag = ColumnTag(
+                        column_fqn=column.fullyQualifiedName.root, tag_label=tag
+                    )
+                    column_tags.append(column_tag)
+            except Exception as err:
+                # TODO: Shouldn't we return a Left here?
+                self.status.failed(
+                    StackTraceError(
+                        name=record.table.fullyQualifiedName.root,
+                        error=f"Error in Processor {self.name} computing tags for [{column}] - [{err}]",
+                        stackTrace=traceback.format_exc(),
+                    )
+                )
+
+        record.column_tags = column_tags
+        return Either(right=record, left=None)
diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py
index 3cc396496ec..bd7b3d8a5cd 100644
--- a/ingestion/src/metadata/pii/processor.py
+++ b/ingestion/src/metadata/pii/processor.py
@@ -12,16 +12,10 @@
 """
 Processor util to fetch pii sensitive columns
 """
-import traceback
-from typing import List, Optional, cast
+from typing import Any, Sequence
 
-from metadata.generated.schema.entity.data.table import Column, TableData
-from metadata.generated.schema.entity.services.ingestionPipelines.status import (
-    StackTraceError,
-)
-from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
-    DatabaseServiceAutoClassificationPipeline,
-)
+from metadata.generated.schema.entity.classification.tag import Tag
+from metadata.generated.schema.entity.data.table import Column
 from metadata.generated.schema.metadataIngestion.workflow import (
     OpenMetadataWorkflowConfig,
 )
@@ -31,24 +25,21 @@ from metadata.generated.schema.type.tagLabel import (
     TagLabel,
     TagSource,
 )
-from metadata.ingestion.api.models import Either
-from metadata.ingestion.api.parser import parse_workflow_config_gracefully
-from metadata.ingestion.api.step import Step
-from metadata.ingestion.api.steps import Processor
-from metadata.ingestion.models.table_metadata import ColumnTag
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
+from metadata.pii.algorithms.classifiers import ColumnClassifier, PIISensitiveClassifier
+from metadata.pii.algorithms.tags import PIISensitivityTag
+from metadata.pii.algorithms.utils import get_top_classes, normalize_scores
+from metadata.pii.base_processor import AutoClassificationProcessor
 from metadata.pii.constants import PII
-from metadata.pii.scanners.column_name_scanner import ColumnNameScanner
-from metadata.pii.scanners.ner_scanner import NERScanner
-from metadata.sampler.models import SamplerResponse
+from metadata.utils import fqn
 from metadata.utils.logger import profiler_logger
 
 logger = profiler_logger()
 
 
-class PIIProcessor(Processor):
+class PIIProcessor(AutoClassificationProcessor):
     """
-    A scanner that uses Spacy NER for entity recognition
+    An AutoClassificationProcessor that uses a PIISensitive classifier to tag columns.
     """
 
     def __init__(
@@ -56,50 +47,21 @@ class PIIProcessor(Processor):
         config: OpenMetadataWorkflowConfig,
         metadata: OpenMetadata,
     ):
-        super().__init__()
-        self.config = config
-        self.metadata = metadata
+        super().__init__(config, metadata)
+        self._classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier()
 
-        # Init and type the source config
-        self.source_config: DatabaseServiceAutoClassificationPipeline = cast(
-            DatabaseServiceAutoClassificationPipeline,
-            self.config.source.sourceConfig.config,
-        )  # Used to satisfy type checked
-
-        self._ner_scanner = None
-        self.name_scanner = ColumnNameScanner()
-        self.confidence_threshold = self.source_config.confidence
-
-    @property
-    def name(self) -> str:
-        return "Auto Classification Processor"
-
-    @property
-    def ner_scanner(self) -> NERScanner:
-        """Load the NER Scanner only if called"""
-        if self._ner_scanner is None:
-            self._ner_scanner = NERScanner()
-
-        return self._ner_scanner
-
-    @classmethod
-    def create(
-        cls,
-        config_dict: dict,
-        metadata: OpenMetadata,
-        pipeline_name: Optional[str] = None,
-    ) -> "Step":
-        config = parse_workflow_config_gracefully(config_dict)
-        return cls(config=config, metadata=metadata)
-
-    def close(self) -> None:
-        """Nothing to close"""
+        self.confidence_threshold = self.source_config.confidence / 100
+        self._tolerance = 0.01
 
     @staticmethod
-    def build_column_tag(tag_fqn: str, column_fqn: str) -> ColumnTag:
-        """
-        Build the tag and run the PATCH
-        """
+    def build_tag_label(tag: PIISensitivityTag) -> TagLabel:
+        tag_fqn = fqn.build(
+            metadata=None,
+            entity_type=Tag,
+            classification_name=PII,
+            tag_name=tag.value,
+        )
+
         tag_label = TagLabel(
             tagFQN=tag_fqn,
             source=TagSource.Classification,
@@ -107,85 +69,27 @@ class PIIProcessor(Processor):
             labelType=LabelType.Generated,
         )
 
-        return ColumnTag(column_fqn=column_fqn, tag_label=tag_label)
+        return tag_label
 
-    def process_column(
-        self,
-        idx: int,
-        column: Column,
-        table_data: Optional[TableData],
-        confidence_threshold: float,
-    ) -> Optional[List[ColumnTag]]:
+    def create_column_tag_labels(
+        self, column: Column, sample_data: Sequence[Any]
+    ) -> Sequence[TagLabel]:
         """
-        Tag a column with PII if we find it using our scanners
+        Create tags for the column based on the sample data.
         """
+        # If the column we are about to process already has PII tags return empty
+        for tag in column.tags or []:
+            if PII in tag.tagFQN.root:
+                return []
 
-        # First, check if the column we are about to process
-        # already has PII tags or not
-        column_has_pii_tag = any((PII in tag.tagFQN.root for tag in column.tags or []))
+        # Get the tags and confidence
+        scores = self._classifier.predict_scores(
+            sample_data, column_name=column.name.root, column_data_type=column.dataType
+        )
 
-        # If it has PII tags, we skip the processing
-        # for the column
-        if column_has_pii_tag is True:
-            return None
+        scores = normalize_scores(scores, tol=self._tolerance)
 
-        # We'll scan first by sample data to prioritize the NER scanner
-        # If we find nothing, we'll check the column name
-        tag_and_confidence = (
-            self.ner_scanner.scan([row[idx] for row in table_data.rows])
-            if table_data
-            else None
-        ) or self.name_scanner.scan(column.name.root)
-
-        if (
-            tag_and_confidence
-            and tag_and_confidence.tag_fqn
-            and tag_and_confidence.confidence >= confidence_threshold / 100
-        ):
-            # We support returning +1 tags for a single column in _run
-            return [
-                self.build_column_tag(
-                    tag_fqn=tag_and_confidence.tag_fqn,
-                    column_fqn=column.fullyQualifiedName.root,
-                )
-            ]
-
-        return None
-
-    def _run(
-        self,
-        record: SamplerResponse,
-    ) -> Either[SamplerResponse]:
-        """
-        Main entrypoint for the scanner.
-
-        Adds PII tagging based on the column names
-        and TableData
-        """
-
-        # We don't always need to process
-        if not self.source_config.enableAutoClassification:
-            return Either(right=record)
-
-        column_tags = []
-        for idx, column in enumerate(record.table.columns):
-            try:
-                col_tags = self.process_column(
-                    idx=idx,
-                    column=column,
-                    table_data=record.sample_data.data,
-                    confidence_threshold=self.confidence_threshold,
-                )
-                if col_tags:
-                    column_tags.extend(col_tags)
-            except Exception as err:
-                self.status.failed(
-                    StackTraceError(
-                        name=record.table.fullyQualifiedName.root,
-                        error=f"Error computing PII tags for [{column}] - [{err}]",
-                        stackTrace=traceback.format_exc(),
-                    )
-                )
-
-        record.column_tags = column_tags
-        return Either(right=record)
+        # winner is at most 1 tag
+        winner = get_top_classes(scores, 1, self.confidence_threshold)
+        tag_labels = [self.build_tag_label(tag) for tag in winner]
+        return tag_labels
diff --git a/ingestion/tests/integration/orm_profiler/test_pii_processor.py b/ingestion/tests/integration/orm_profiler/test_pii_processor.py
index 1c480766855..76c03c37f01 100644
--- a/ingestion/tests/integration/orm_profiler/test_pii_processor.py
+++ b/ingestion/tests/integration/orm_profiler/test_pii_processor.py
@@ -307,7 +307,6 @@ class PiiProcessorTest(TestCase):
         )
 
         updated_record: ProfilerResponse = self.pii_processor.run(record)
-
         for expected, updated in zip(EXPECTED_COLUMN_TAGS, updated_record.column_tags):
             self.assertEqual(expected.column_fqn, updated.column_fqn)
             self.assertEqual(expected.tag_label.tagFQN, updated.tag_label.tagFQN)
diff --git a/ingestion/tests/unit/pii/test_pii_sensitive.py b/ingestion/tests/unit/pii/test_pii_sensitive.py
new file mode 100644
index 00000000000..dd532f4e93b
--- /dev/null
+++ b/ingestion/tests/unit/pii/test_pii_sensitive.py
@@ -0,0 +1,35 @@
+#  Copyright 2025 Collate
+#  Licensed under the Collate Community License, Version 1.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from metadata.generated.schema.type.tagLabel import LabelType, State, TagSource
+from metadata.pii.algorithms.tags import PIISensitivityTag
+from metadata.pii.processor import PIIProcessor
+
+
+def test_pii_processor_build_tag_label_for_pii_sensitive():
+
+    tag = PIISensitivityTag.SENSITIVE
+    tag_label = PIIProcessor.build_tag_label(tag)
+
+    assert tag_label.tagFQN.root == "PII.Sensitive"
+    assert tag_label.source == TagSource.Classification
+    assert tag_label.state == State.Suggested
+    assert tag_label.labelType == LabelType.Generated
+
+
+def test_pii_processor_build_tag_label_for_pii_nonsensitive():
+    tag = PIISensitivityTag.NONSENSITIVE
+    tag_label = PIIProcessor.build_tag_label(tag)
+
+    assert tag_label.tagFQN.root == "PII.NonSensitive"
+    assert tag_label.source == TagSource.Classification
+    assert tag_label.state == State.Suggested
+    assert tag_label.labelType == LabelType.Generated