From 1e86f9870fd663122b9bbb64f3cf17cf32619c7f Mon Sep 17 00:00:00 2001
From: Pere Menal-Ferrer
Date: Tue, 10 Jun 2025 01:05:35 +0200
Subject: [PATCH] FIX #1464 (#21520)
* Add PIICategoryTags and some utilities on top of them.
* Fix static-check
* Add test for fqn representation
* Add NEREntityGeneralTags.json from Collate
* Add test to check PIICategoryTags agree with the ones used by OM server
* Add LabelExtractor
* Fix style
* Add ignore superfluous-parens for pylint
* Add comment as per PR review
* Fix not-updated PII-IT
* Remove duplicated IT test for PII
---------
Co-authored-by: Pere Menal
Co-authored-by: Sriharsha Chintalapani
---
ingestion/pyproject.toml | 1 -
.../{classifiers.py => column_labelers.py} | 55 +-----
.../pii/algorithms/feature_extraction.py | 2 +-
.../pii/algorithms/label_extractors.py | 110 +++++++++++
.../metadata/pii/algorithms/scoring_ops.py | 74 +++++++
ingestion/src/metadata/pii/algorithms/tags.py | 71 +++++--
.../src/metadata/pii/algorithms/tags_ops.py | 137 +++++++++++++
.../src/metadata/pii/algorithms/utils.py | 38 ----
ingestion/src/metadata/pii/base_processor.py | 1 -
ingestion/src/metadata/pii/constants.py | 2 -
ingestion/src/metadata/pii/models.py | 26 ---
ingestion/src/metadata/pii/ner.py | 85 --------
ingestion/src/metadata/pii/processor.py | 109 +++++++----
ingestion/src/metadata/pii/scanners/base.py | 23 ---
.../pii/scanners/column_name_scanner.py | 84 --------
.../src/metadata/pii/scanners/ner_scanner.py | 185 ------------------
ingestion/tests/__init__.py | 5 +
ingestion/tests/integration/pii/__init__.py | 0
.../test_pii_processor.py | 27 +++
.../unit/pii/algorithms/test_classifiers.py | 6 +-
.../pii/algorithms/test_label_extractors.py | 45 +++++
.../pii/algorithms/test_presidio_utils.py | 2 +-
.../tests/unit/pii/algorithms/test_tags.py | 22 +++
.../unit/pii/algorithms/test_tags_ops.py | 39 ++++
.../unit/pii/test_column_name_scanner.py | 66 -------
ingestion/tests/unit/pii/test_ner_scanner.py | 165 ----------------
.../tests/unit/pii/test_pii_sensitive.py | 35 ----
ingestion/tests/unit/pii/test_processor.py | 33 ++++
.../json/data/tags/NEREntityGeneralTags.json | 94 +++++++++
29 files changed, 730 insertions(+), 812 deletions(-)
rename ingestion/src/metadata/pii/algorithms/{classifiers.py => column_labelers.py} (72%)
create mode 100644 ingestion/src/metadata/pii/algorithms/label_extractors.py
create mode 100644 ingestion/src/metadata/pii/algorithms/scoring_ops.py
create mode 100644 ingestion/src/metadata/pii/algorithms/tags_ops.py
delete mode 100644 ingestion/src/metadata/pii/algorithms/utils.py
delete mode 100644 ingestion/src/metadata/pii/models.py
delete mode 100644 ingestion/src/metadata/pii/ner.py
delete mode 100644 ingestion/src/metadata/pii/scanners/base.py
delete mode 100644 ingestion/src/metadata/pii/scanners/column_name_scanner.py
delete mode 100644 ingestion/src/metadata/pii/scanners/ner_scanner.py
create mode 100644 ingestion/tests/integration/pii/__init__.py
rename ingestion/tests/integration/{orm_profiler => pii}/test_pii_processor.py (92%)
create mode 100644 ingestion/tests/unit/pii/algorithms/test_label_extractors.py
create mode 100644 ingestion/tests/unit/pii/algorithms/test_tags.py
create mode 100644 ingestion/tests/unit/pii/algorithms/test_tags_ops.py
delete mode 100644 ingestion/tests/unit/pii/test_column_name_scanner.py
delete mode 100644 ingestion/tests/unit/pii/test_ner_scanner.py
delete mode 100644 ingestion/tests/unit/pii/test_pii_sensitive.py
create mode 100644 ingestion/tests/unit/pii/test_processor.py
create mode 100644 openmetadata-service/src/main/resources/json/data/tags/NEREntityGeneralTags.json
diff --git a/ingestion/pyproject.toml b/ingestion/pyproject.toml
index 763c9f8ef18..6cbf8672ba6 100644
--- a/ingestion/pyproject.toml
+++ b/ingestion/pyproject.toml
@@ -259,7 +259,6 @@ ignore = [
"src/metadata/ingestion/*",
"src/metadata/mixins/*",
"src/metadata/parsers/*",
- "src/metadata/pii/scanners/*",
"src/metadata/pii/*processor.py",
"src/metadata/profiler/*",
"src/metadata/sampler/*",
diff --git a/ingestion/src/metadata/pii/algorithms/classifiers.py b/ingestion/src/metadata/pii/algorithms/column_labelers.py
similarity index 72%
rename from ingestion/src/metadata/pii/algorithms/classifiers.py
rename to ingestion/src/metadata/pii/algorithms/column_labelers.py
index bfa6c621bfc..97adb0efaca 100644
--- a/ingestion/src/metadata/pii/algorithms/classifiers.py
+++ b/ingestion/src/metadata/pii/algorithms/column_labelers.py
@@ -12,10 +12,8 @@
Classifier for PII detection and sensitivity tagging.
"""
from abc import ABC, abstractmethod
-from collections import defaultdict
from typing import (
Any,
- DefaultDict,
Dict,
Generic,
Hashable,
@@ -47,12 +45,12 @@ from metadata.pii.algorithms.presidio_utils import (
build_analyzer_engine,
set_presidio_logger_level,
)
-from metadata.pii.algorithms.tags import PIISensitivityTag, PIITag
+from metadata.pii.algorithms.tags import PIITag
T = TypeVar("T", bound=Hashable)
-class ColumnClassifier(ABC, Generic[T]):
+class ColumnLabeler(ABC, Generic[T]):
"""
Base class for column classifiers.
This class defines the interface for classifiers that predict the class
@@ -77,7 +75,7 @@ class ColumnClassifier(ABC, Generic[T]):
@final
-class HeuristicPIIClassifier(ColumnClassifier[PIITag]):
+class HeuristicPIILabeler(ColumnLabeler[PIITag]):
"""
Heuristic PII Column Classifier
"""
@@ -140,45 +138,12 @@ class HeuristicPIIClassifier(ColumnClassifier[PIITag]):
if tag in column_name_matches:
final_score += self._column_name_contribution
# Apply the score cutoff
- if final_score >= self._score_cutoff:
- final_results[tag] = final_score
+ if final_score < self._score_cutoff:
+ continue
+ final_results[tag] = final_score
+
+ # Make sure all scores are capped at 1.0
+ for tag in final_results:
+ final_results[tag] = min(final_results[tag], 1.0)
return final_results
-
-
-class PIISensitiveClassifier(ColumnClassifier[PIISensitivityTag]):
- """
- Implements a classifier for PII sensitivity tags based on a given
- PII column classifier. If no classifier is provided, it defaults to
- using the HeuristicPIIColumnClassifier.
- """
-
- def __init__(self, classifier: Optional[ColumnClassifier[PIITag]] = None):
- self.classifier: ColumnClassifier[PIITag] = (
- classifier or HeuristicPIIClassifier()
- )
-
- def predict_scores(
- self,
- sample_data: Sequence[Any],
- column_name: Optional[str] = None,
- column_data_type: Optional[DataType] = None,
- ) -> Mapping[PIISensitivityTag, float]:
- pii_tags = self.classifier.predict_scores(
- sample_data, column_name, column_data_type
- )
- results: DefaultDict[PIISensitivityTag, float] = defaultdict(float)
- counts: DefaultDict[PIISensitivityTag, int] = defaultdict(int)
-
- for tag, score in pii_tags.items():
- # Convert PIITag to PIISensitivityTag
- pii_sensitivity = tag.sensitivity()
- results[pii_sensitivity] += score
- counts[pii_sensitivity] += 1
-
- # Normalize the scores
- for tag in results:
- if counts[tag] > 0:
- results[tag] /= counts[tag]
-
- return results
diff --git a/ingestion/src/metadata/pii/algorithms/feature_extraction.py b/ingestion/src/metadata/pii/algorithms/feature_extraction.py
index 4a5b3a644bc..d24fc1a15ab 100644
--- a/ingestion/src/metadata/pii/algorithms/feature_extraction.py
+++ b/ingestion/src/metadata/pii/algorithms/feature_extraction.py
@@ -22,7 +22,7 @@ from presidio_analyzer import AnalyzerEngine
from metadata.generated.schema.entity.data.table import DataType
from metadata.pii.algorithms.presidio_patches import PresidioRecognizerResultPatcher
from metadata.pii.algorithms.tags import PIITag
-from metadata.pii.scanners.ner_scanner import SUPPORTED_LANG
+from metadata.pii.constants import SUPPORTED_LANG
from metadata.utils.logger import pii_logger
logger = pii_logger()
diff --git a/ingestion/src/metadata/pii/algorithms/label_extractors.py b/ingestion/src/metadata/pii/algorithms/label_extractors.py
new file mode 100644
index 00000000000..50b320ca58e
--- /dev/null
+++ b/ingestion/src/metadata/pii/algorithms/label_extractors.py
@@ -0,0 +1,110 @@
+# Copyright 2025 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Label extractor interface and implementations.
+"""
+from abc import ABC, abstractmethod
+from typing import Generic, Mapping, Set, TypeVar, final
+
+from metadata.pii.algorithms.scoring_ops import scores_cleanup, scores_to_probabilities
+
+T = TypeVar("T")
+
+
+class LabelExtractor(ABC, Generic[T]):
+ """
+ Protocol for extracting labels from a mapping of label scores.
+
+    The goal is to abstract the logic of how labels are extracted
+ from the scores, allowing different strategies to be implemented
+ depending on the underlying algorithm or use-case.
+ """
+
+ @abstractmethod
+ def extract_labels(self, scores: Mapping[T, float]) -> Set[T]:
+ """
+ Extract labels from the given scores mapping.
+
+ Args:
+ scores (Mapping[T, float]): A mapping from labels to scores or probabilities.
+
+ Returns:
+ Set[T]: A set of labels extracted from the scores.
+ """
+
+
+@final
+class ProbabilisticLabelExtractor(LabelExtractor[T], Generic[T]):
+ """
+ Extracts the most probable label(s) from a set of raw class scores using score filtering
+ and probability normalization.
+
+ This extractor treats the input scores as representing a multiclass classification scenario,
+ where only one or a few mutually exclusive labels are expected to be true. It filters out
+ low-confidence scores, normalizes the remaining ones into a probability distribution, and
+ returns the top-k labels that meet a minimum probability threshold.
+
+ After normalization, scores are interpreted as probabilities—that is, each label's
+ value represents its relative likelihood among the remaining candidates.
+
+ Args:
+ k (int): The number of top labels to consider based on normalized probability.
+ score_threshold (float): Minimum raw score required to keep a label before normalization.
+ prob_threshold (float): Minimum normalized probability required for a label to be returned.
+
+ Returns:
+ Set[T]: A set of labels that pass both score and probability thresholds.
+
+ Notes:
+ - If only one label remains after score filtering, it will have a probability of 1.0
+ and will always be returned if `k >= 1`.
+ - When multiple labels remain, their probabilities may be lower, and some or all
+ may fall below the `prob_threshold`.
+ - This approach implicitly encodes a confidence mechanism: a label must be
+ both strong enough in raw score and relatively dominant in probability to be selected.
+ """
+
+ def __init__(
+ self,
+ k: int,
+ score_threshold: float,
+ prob_threshold: float,
+ ) -> None:
+
+ if not (0 <= score_threshold <= 1): # pylint: disable=superfluous-parens
+ raise ValueError("score_threshold must be between 0 and 1")
+ if not (0 <= prob_threshold <= 1): # pylint: disable=superfluous-parens
+ raise ValueError("prob_threshold must be between 0 and 1")
+ if k < 1:
+ raise ValueError("k must be at least 1")
+
+ self._score_threshold = score_threshold
+ self._prob_threshold = prob_threshold
+ self._k = k
+
+ def extract_labels(self, scores: Mapping[T, float]) -> Set[T]:
+ """
+ Applies filtering and probability-based selection to extract high-confidence labels.
+ """
+ filtered_scores = scores_cleanup(
+ scores, min_score=self._score_threshold, max_score=1.0
+ )
+
+ probabilities = scores_to_probabilities(filtered_scores)
+
+ if probabilities is None:
+ return set()
+
+ top_k = sorted(probabilities.items(), key=lambda item: item[1], reverse=True)[
+ : self._k
+ ]
+
+ return {label for label, prob in top_k if prob >= self._prob_threshold}
diff --git a/ingestion/src/metadata/pii/algorithms/scoring_ops.py b/ingestion/src/metadata/pii/algorithms/scoring_ops.py
new file mode 100644
index 00000000000..75f88349672
--- /dev/null
+++ b/ingestion/src/metadata/pii/algorithms/scoring_ops.py
@@ -0,0 +1,74 @@
+# Copyright 2025 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utility functions for PII algorithms
+"""
+from collections import defaultdict
+from typing import Callable, DefaultDict, Mapping, Optional, TypeVar
+
+T = TypeVar("T")
+S = TypeVar("S")
+
+
+# Scores transformations
+
+
+def scores_cleanup(
+ scores: Mapping[T, float], min_score: float = 0.01, max_score: float = 1.0
+) -> Mapping[T, float]:
+ """
+ Clean the scores mapping by removing keys with scores below the minimum score.
+ Scores above the maximum score are capped to the maximum score.
+ """
+ if min_score > max_score:
+ raise ValueError(
+ f"Minimum score {min_score} cannot be greater than maximum score {max_score}."
+ )
+ return {
+ key: min(score, max_score)
+ for key, score in scores.items()
+ if score >= min_score
+ }
+
+
+def scores_group_by(
+ scores: Mapping[T, float], key_fn: Callable[[T], S]
+) -> Mapping[S, float]:
+ """
+ Group the scores by a key function.
+ The key function is applied to each key in `scores`,
+ and the scores are averaged for each group, thus maintaining
+ the score within the same range as the original one.
+ """
+ grouped: DefaultDict[S, float] = defaultdict(float)
+ counts: DefaultDict[S, int] = defaultdict(int)
+
+    # First, we sum the scores and count the occurrences of each group key
+ for key, score in scores.items():
+ grouped[key_fn(key)] += score
+ counts[key_fn(key)] += 1
+
+ # Then, we average the scores by dividing by the count
+ for key in grouped:
+ grouped[key] /= counts[key]
+
+ return grouped
+
+
+def scores_to_probabilities(
+ scores: Mapping[T, float], tolerance: float = 0.001
+) -> Optional[Mapping[T, float]]:
+ total = sum(scores.values())
+
+ if total < tolerance:
+ return None
+
+ return {key: score / total for key, score in scores.items()}
diff --git a/ingestion/src/metadata/pii/algorithms/tags.py b/ingestion/src/metadata/pii/algorithms/tags.py
index 5281bfd31d5..1086cf1269f 100644
--- a/ingestion/src/metadata/pii/algorithms/tags.py
+++ b/ingestion/src/metadata/pii/algorithms/tags.py
@@ -16,15 +16,68 @@ import enum
from typing import List
+class PIIClassificationName(enum.Enum):
+ """
+ Classification name for PII related tags:
+    - PII: means it is PIISensitive or PIINonSensitive.
+ - General: means PII Category (e.g., PERSON, EMAIL, etc.).
+ """
+
+ PII = "PII"
+ GENERAL = "General"
+
+
class PIISensitivityTag(enum.Enum):
SENSITIVE = "Sensitive"
NONSENSITIVE = "NonSensitive"
+ @classmethod
+ def pii_classification_name(cls) -> PIIClassificationName:
+ return PIIClassificationName.PII
+
+
+@enum.unique
+class PIICategoryTag(enum.Enum):
+ """
+ PII Category Tags.
+ These tags are used to categorize the PII tags into broader categories,
+ for instance, to show the PII tags in the UI.
+ """
+
+ PASSWORD = "Password"
+ BANK_NUMBER = "BankNumber"
+ PERSON = "Person"
+ BIRTH_DATE = "BirthDate"
+ GENDER = "Gender"
+ NRP = "NRP"
+ ADDRESS = "Address"
+ CREDIT_CARD = "CreditCardNumber"
+ CRYPTO = "Crypto"
+ DATE_TIME = "DateTime"
+ EMAIL_ADDRESS = "Email"
+ IBAN_CODE = "IBANCode"
+ IP_ADDRESS = "IPAddress"
+ LOCATION = "Location"
+ PHONE_NUMBER = "PhoneNumber"
+ MEDICAL_LICENSE = "MedicalLicense"
+ URL = "URL"
+ DRIVER_LICENSE = "DriverLicense"
+ NATIONAL_ID = "NationalID"
+ PASSPORT = "Passport"
+ VAT_CODE = "VATCode"
+
+ @classmethod
+ def pii_classification_name(cls) -> PIIClassificationName:
+ return PIIClassificationName.GENERAL
+
@enum.unique
class PIITag(enum.Enum):
"""
PII Tags (borrowed from Presidio https://microsoft.github.io/presidio/supported_entities/).
+ The values of these tags are valid Presidio entity names, changing them
+ will break the integration with Presidio.
+ A better name for this enum would have been `PresidioPII`.
"""
# Global
@@ -91,21 +144,3 @@ class PIITag(enum.Enum):
Get all the values of the enum as a set of strings.
"""
return [tag.value for tag in cls]
-
- def sensitivity(self) -> PIISensitivityTag:
- """
- Get the sensitivity level of the PII tag.
- This map is opinionated and can be changed in the future according to users' needs.
- """
- if self in DEFAULT_NON_PII_SENSITIVE:
- return PIISensitivityTag.NONSENSITIVE
- return PIISensitivityTag.SENSITIVE
-
-
-DEFAULT_NON_PII_SENSITIVE = (
- PIITag.DATE_TIME,
- PIITag.NRP,
- PIITag.LOCATION,
- PIITag.PHONE_NUMBER,
- PIITag.URL,
-)
diff --git a/ingestion/src/metadata/pii/algorithms/tags_ops.py b/ingestion/src/metadata/pii/algorithms/tags_ops.py
new file mode 100644
index 00000000000..ec220a315ac
--- /dev/null
+++ b/ingestion/src/metadata/pii/algorithms/tags_ops.py
@@ -0,0 +1,137 @@
+# Copyright 2025 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utility operations on top of the tags defined in PII algorithms.
+"""
+from typing import Collection, Dict, Optional, Set
+
+from metadata.pii.algorithms.tags import PIICategoryTag, PIISensitivityTag, PIITag
+
+
+def categorize_pii_tag(pii_tag: PIITag) -> PIICategoryTag:
+ """
+ Categorize the PII tag into a broader category.
+ """
+ # return the category tag if the PII tag is in the category map
+ # the category map is defined separately for better readability
+ for category, tags in _CATEGORY_MAP.items():
+ if pii_tag in tags:
+ return category
+
+ # This should never happen, as we should have unit tests to ensure all PII tags are categorized.
+ raise ValueError(f"PII tag does not belong to any category: {pii_tag}")
+
+
+def get_sensitivity_for_pii_category(
+ pii_category_tag: PIICategoryTag,
+) -> PIISensitivityTag:
+ """
+ Get the sensitivity level of the PIICategoryTag.
+    This map is opinionated and can be changed in the future according to users' needs.
+ """
+ non_pii_sensitive = (
+ PIICategoryTag.GENDER,
+ PIICategoryTag.NRP,
+ PIICategoryTag.DATE_TIME,
+ PIICategoryTag.LOCATION,
+ PIICategoryTag.PHONE_NUMBER,
+ PIICategoryTag.URL,
+ )
+ if pii_category_tag in non_pii_sensitive:
+ return PIISensitivityTag.NONSENSITIVE
+ return PIISensitivityTag.SENSITIVE
+
+
+def resolve_sensitivity(
+ sensitivities: Collection[PIISensitivityTag],
+) -> Optional[PIISensitivityTag]:
+ """
+    Resolve the sensitivity level from a collection of PIISensitivityTag.
+    The most restrictive sensitivity is returned if multiple tags are present.
+ """
+ if not sensitivities:
+ return None
+ if PIISensitivityTag.SENSITIVE in sensitivities:
+ return PIISensitivityTag.SENSITIVE
+ return PIISensitivityTag.NONSENSITIVE
+
+
+def get_sensitivity_for_pii(pii_tag: PIITag) -> PIISensitivityTag:
+ """
+ Get the sensitivity level of the PIITag.
+ This map is opinionated and can be changed in the future according to users' needs.
+ """
+ pii_category_tag = categorize_pii_tag(pii_tag)
+ return get_sensitivity_for_pii_category(pii_category_tag)
+
+
+# Parent child aliases
+_P = PIICategoryTag
+_C = PIITag
+
+# Define which PIITags each PIICategoryTag contains, in order to map
+# Presidio PII tags to a PIICategoryTag.
+_CATEGORY_MAP: Dict[PIICategoryTag, Set[PIITag]] = {
+ _P.PASSWORD: set(),
+ _P.BANK_NUMBER: {_C.US_BANK_NUMBER},
+ _P.CREDIT_CARD: {_C.CREDIT_CARD},
+ _P.PERSON: {_C.PERSON},
+ _P.GENDER: set(),
+ _P.NRP: {_C.NRP},
+ _P.ADDRESS: set(),
+ _P.CRYPTO: {_C.CRYPTO},
+ _P.DATE_TIME: {_C.DATE_TIME},
+ _P.EMAIL_ADDRESS: {_C.EMAIL_ADDRESS},
+ _P.IBAN_CODE: {_C.IBAN_CODE},
+ _P.IP_ADDRESS: {_C.IP_ADDRESS},
+ _P.LOCATION: {_C.LOCATION},
+ _P.PHONE_NUMBER: {_C.PHONE_NUMBER},
+ _P.MEDICAL_LICENSE: {_C.MEDICAL_LICENSE},
+ _P.URL: {_C.URL},
+ _P.DRIVER_LICENSE: {
+ _C.US_DRIVER_LICENSE,
+ _C.UK_NHS,
+ _C.IT_DRIVER_LICENSE,
+ },
+ _P.NATIONAL_ID: {
+ _C.US_ITIN,
+ _C.US_SSN,
+ _C.UK_NHS,
+ _C.ES_NIF,
+ _C.ES_NIE,
+ _C.IT_FISCAL_CODE,
+ _C.IT_PASSPORT,
+ _C.IT_IDENTITY_CARD,
+ _C.PL_PESEL,
+ _C.SG_NRIC_FIN,
+ _C.SG_UEN,
+ _C.AU_ABN,
+ _C.AU_ACN,
+ _C.AU_TFN,
+ _C.AU_MEDICARE,
+ _C.IN_PAN,
+ _C.IN_AADHAAR,
+ _C.IN_VEHICLE_REGISTRATION,
+ _C.IN_VOTER,
+ _C.FI_PERSONAL_IDENTITY_CODE,
+ },
+ _P.PASSPORT: {
+ _C.US_PASSPORT,
+ _C.IT_PASSPORT,
+ _C.IN_PASSPORT,
+ },
+ _P.VAT_CODE: {
+ _C.IT_VAT_CODE,
+ _C.AU_ABN,
+ _C.AU_ACN,
+ _C.AU_TFN,
+ },
+}
diff --git a/ingestion/src/metadata/pii/algorithms/utils.py b/ingestion/src/metadata/pii/algorithms/utils.py
deleted file mode 100644
index cf482336c9d..00000000000
--- a/ingestion/src/metadata/pii/algorithms/utils.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Utility functions for PII algorithms
-"""
-from typing import Mapping, Sequence, TypeVar
-
-T = TypeVar("T")
-
-
-def normalize_scores(scores: Mapping[T, float], tol: float = 0.01) -> Mapping[T, float]:
- """
- Normalize the scores to sum to 1, while ignoring scores below the tolerance.
- Scores must be positive.
- """
- scores = {key: score for key, score in scores.items() if score > tol}
- total = sum(scores.values())
- if total == 0:
- return scores
- return {key: score / total for key, score in scores.items()}
-
-
-def get_top_classes(scores: Mapping[T, float], n: int, threshold: float) -> Sequence[T]:
- """
- Get the top n scores from the scores mapping that are above the threshold.
- The classes are sorted in descending order of their scores.
- """
- sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
- top_classes = [key for key, score in sorted_scores if score >= threshold]
- return top_classes[:n]
diff --git a/ingestion/src/metadata/pii/base_processor.py b/ingestion/src/metadata/pii/base_processor.py
index 0d94178b0e2..d001e8c4193 100644
--- a/ingestion/src/metadata/pii/base_processor.py
+++ b/ingestion/src/metadata/pii/base_processor.py
@@ -112,7 +112,6 @@ class AutoClassificationProcessor(Processor, ABC):
)
column_tags.append(column_tag)
except Exception as err:
- # TODO: Shouldn't we return a Left here?
self.status.failed(
StackTraceError(
name=record.table.fullyQualifiedName.root,
diff --git a/ingestion/src/metadata/pii/constants.py b/ingestion/src/metadata/pii/constants.py
index 3dd60dd85e3..fe54bbf924e 100644
--- a/ingestion/src/metadata/pii/constants.py
+++ b/ingestion/src/metadata/pii/constants.py
@@ -12,8 +12,6 @@
PII constants
"""
-PII = "PII"
-
# Constants for Presidio
PRESIDIO_LOGGER = "presidio-analyzer"
SPACY_EN_MODEL = "en_core_web_md"
diff --git a/ingestion/src/metadata/pii/models.py b/ingestion/src/metadata/pii/models.py
deleted file mode 100644
index 9a74cb4c366..00000000000
--- a/ingestion/src/metadata/pii/models.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-PII processing models
-"""
-from enum import Enum
-
-from pydantic import BaseModel
-
-
-class TagType(Enum):
- SENSITIVE = "Sensitive"
- NONSENSITIVE = "NonSensitive"
-
-
-class TagAndConfidence(BaseModel):
- tag_fqn: str
- confidence: float
diff --git a/ingestion/src/metadata/pii/ner.py b/ingestion/src/metadata/pii/ner.py
deleted file mode 100644
index 664286c27d9..00000000000
--- a/ingestion/src/metadata/pii/ner.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-NER Scanner based on Presidio.
-
-Supported Entities https://microsoft.github.io/presidio/supported_entities/
-"""
-from enum import Enum
-
-from metadata.pii.models import TagType
-from metadata.utils.logger import pii_logger
-
-logger = pii_logger()
-
-
-class NEREntity(Enum):
- """
- PII Entities supported by Presidio https://microsoft.github.io/presidio/supported_entities/
- """
-
- # Global
- CREDIT_CARD = TagType.SENSITIVE.value
- CRYPTO = TagType.SENSITIVE.value
- DATE_TIME = TagType.NONSENSITIVE.value
- EMAIL_ADDRESS = TagType.SENSITIVE.value
- IBAN_CODE = TagType.SENSITIVE.value
- IP_ADDRESS = TagType.SENSITIVE.value
- NRP = TagType.NONSENSITIVE.value
- LOCATION = TagType.NONSENSITIVE.value
- PERSON = TagType.SENSITIVE.value
- PHONE_NUMBER = TagType.NONSENSITIVE.value
- MEDICAL_LICENSE = TagType.SENSITIVE.value
- URL = TagType.NONSENSITIVE.value
-
- # USA
- US_BANK_NUMBER = TagType.SENSITIVE.value
- US_DRIVER_LICENSE = TagType.SENSITIVE.value
- US_ITIN = TagType.SENSITIVE.value
- US_PASSPORT = TagType.SENSITIVE.value
- US_SSN = TagType.SENSITIVE.value
-
- # UK
- UK_NHS = TagType.SENSITIVE.value
-
- # Spain
- ES_NIF = TagType.SENSITIVE.value
- ES_NIE = TagType.SENSITIVE.value
-
- # Italy
- IT_FISCAL_CODE = TagType.SENSITIVE.value
- IT_DRIVER_LICENSE = TagType.SENSITIVE.value
- IT_VAT_CODE = TagType.SENSITIVE.value
- IT_PASSPORT = TagType.SENSITIVE.value
- IT_IDENTITY_CARD = TagType.SENSITIVE.value
-
- # Poland
- PL_PESEL = TagType.SENSITIVE.value
-
- # Singapore
- SG_NRIC_FIN = TagType.SENSITIVE.value
- SG_UEN = TagType.SENSITIVE.value
-
- # Australia
- AU_ABN = TagType.SENSITIVE.value
- AU_ACN = TagType.SENSITIVE.value
- AU_TFN = TagType.SENSITIVE.value
- AU_MEDICARE = TagType.SENSITIVE.value
-
- # India
- IN_PAN = TagType.SENSITIVE.value
- IN_AADHAAR = TagType.SENSITIVE.value
- IN_VEHICLE_REGISTRATION = TagType.SENSITIVE.value
- IN_VOTER = TagType.SENSITIVE.value
- IN_PASSPORT = TagType.SENSITIVE.value
-
- # Finland
- FI_PERSONAL_IDENTITY_CODE = TagType.SENSITIVE.value
diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py
index 1bd4f5a6eb7..7c793989b6f 100644
--- a/ingestion/src/metadata/pii/processor.py
+++ b/ingestion/src/metadata/pii/processor.py
@@ -12,7 +12,7 @@
"""
Processor util to fetch pii sensitive columns
"""
-from typing import Any, Sequence
+from typing import Any, Sequence, TypeVar, Union
from metadata.generated.schema.entity.classification.tag import Tag
from metadata.generated.schema.entity.data.table import Column
@@ -26,13 +26,28 @@ from metadata.generated.schema.type.tagLabel import (
TagSource,
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata
-from metadata.pii.algorithms.tags import PIISensitivityTag
-from metadata.pii.algorithms.utils import get_top_classes, normalize_scores
+from metadata.pii.algorithms.label_extractors import (
+ LabelExtractor,
+ ProbabilisticLabelExtractor,
+)
+from metadata.pii.algorithms.scoring_ops import scores_group_by
+from metadata.pii.algorithms.tags import (
+ PIICategoryTag,
+ PIIClassificationName,
+ PIISensitivityTag,
+ PIITag,
+)
+from metadata.pii.algorithms.tags_ops import (
+ categorize_pii_tag,
+ get_sensitivity_for_pii_category,
+ resolve_sensitivity,
+)
from metadata.pii.base_processor import AutoClassificationProcessor
-from metadata.pii.constants import PII
from metadata.utils import fqn
from metadata.utils.logger import profiler_logger
+T = TypeVar("T")
+
logger = profiler_logger()
@@ -48,33 +63,23 @@ class PIIProcessor(AutoClassificationProcessor):
):
super().__init__(config, metadata)
- from metadata.pii.algorithms.classifiers import ( # pylint: disable=import-outside-toplevel
- ColumnClassifier,
- PIISensitiveClassifier,
+ from metadata.pii.algorithms.column_labelers import ( # pylint: disable=import-outside-toplevel
+ ColumnLabeler,
+ HeuristicPIILabeler,
)
- self._classifier: ColumnClassifier[PIISensitivityTag] = PIISensitiveClassifier()
+ prob_threshold = self.source_config.confidence / 100
+ score_threshold = 0.1 # This is the minimum score to consider a tag
- self.confidence_threshold = self.source_config.confidence / 100
- self._tolerance = 0.01
-
- @staticmethod
- def build_tag_label(tag: PIISensitivityTag) -> TagLabel:
- tag_fqn = fqn.build(
- metadata=None,
- entity_type=Tag,
- classification_name=PII,
- tag_name=tag.value,
+ self._pii_category_extractor: LabelExtractor[
+ PIICategoryTag
+ ] = ProbabilisticLabelExtractor(
+ score_threshold=score_threshold,
+ prob_threshold=prob_threshold,
+ k=1, # k=1 means we return only the top category
)
- tag_label = TagLabel(
- tagFQN=tag_fqn,
- source=TagSource.Classification,
- state=State.Suggested,
- labelType=LabelType.Generated,
- )
-
- return tag_label
+ self._pii_labeler: ColumnLabeler[PIITag] = HeuristicPIILabeler()
def create_column_tag_labels(
self, column: Column, sample_data: Sequence[Any]
@@ -84,17 +89,55 @@ class PIIProcessor(AutoClassificationProcessor):
"""
# If the column we are about to process already has PII tags return empty
for tag in column.tags or []:
- if PII in tag.tagFQN.root:
+ if PIIClassificationName.PII.value in tag.tagFQN.root:
return []
- # Get the tags and confidence
- scores = self._classifier.predict_scores(
+ pii_tag_scores = self._pii_labeler.predict_scores(
sample_data, column_name=column.name.root, column_data_type=column.dataType
)
- scores = normalize_scores(scores, tol=self._tolerance)
+ pii_category_scores = scores_group_by(pii_tag_scores, categorize_pii_tag)
+
+ # We allow more than one category to be assigned; this might be useful
+ # for documents that contain multiple PII types.
+ # Whether we return one or multiple labels is controlled
+ # by the LabelExtractor; to modify this behavior, please change the
+ # LabelExtractor used, and not the implementation of this method.
+
+ pii_categories = self._pii_category_extractor.extract_labels(
+ pii_category_scores
+ )
+
+ tag_labels = [get_tag_label(pii_category) for pii_category in pii_categories]
+
+ # Determine the sensitivity of the PII categories, if any
+ pii_sensitivity = resolve_sensitivity(
+ {get_sensitivity_for_pii_category(pc) for pc in pii_categories}
+ )
+
+ if pii_sensitivity:
+ tag_labels.append(get_tag_label(pii_sensitivity))
- # winner is at most 1 tag
- winner = get_top_classes(scores, 1, self.confidence_threshold)
- tag_labels = [self.build_tag_label(tag) for tag in winner]
return tag_labels
+
+
+def get_tag_label(tag: Union[PIICategoryTag, PIISensitivityTag]) -> TagLabel:
+
+ fqn_str = fqn.build(
+ None,
+ entity_type=Tag,
+ classification_name=tag.pii_classification_name().value,
+ tag_name=tag.value,
+ )
+
+ if fqn_str is None:
+ # This should be prevented by unit tests, but in case it happens,
+ # we prefer to fail noisily rather than silently returning None.
+ raise ValueError(f"Failed to build FQN for tag: {tag}")
+
+ return TagLabel(
+ tagFQN=fqn_str,
+ source=TagSource.Classification,
+ state=State.Suggested,
+ labelType=LabelType.Generated,
+ )
diff --git a/ingestion/src/metadata/pii/scanners/base.py b/ingestion/src/metadata/pii/scanners/base.py
deleted file mode 100644
index b068d2757b9..00000000000
--- a/ingestion/src/metadata/pii/scanners/base.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Basic Scanner ABC
-"""
-from abc import ABC, abstractmethod
-from typing import Any
-
-
-class BaseScanner(ABC):
- """Basic scanner abstract class"""
-
- @abstractmethod
- def scan(self, data: Any):
- """Scan the given data from a column"""
diff --git a/ingestion/src/metadata/pii/scanners/column_name_scanner.py b/ingestion/src/metadata/pii/scanners/column_name_scanner.py
deleted file mode 100644
index 732164c290b..00000000000
--- a/ingestion/src/metadata/pii/scanners/column_name_scanner.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Regex scanner for column names
-"""
-import re
-from typing import Optional
-
-from metadata.generated.schema.entity.classification.tag import Tag
-from metadata.pii.constants import PII
-from metadata.pii.models import TagAndConfidence, TagType
-from metadata.pii.scanners.base import BaseScanner
-from metadata.utils import fqn
-
-
-class ColumnNameScanner(BaseScanner):
- """Column Name Scanner to scan column name"""
-
- sensitive_regex = {
- "PASSWORD": re.compile("^.*password.*$", re.IGNORECASE),
- "US_SSN": re.compile("^.*(ssn|social).*$", re.IGNORECASE),
- "CREDIT_CARD": re.compile("^.*(credit).*(card).*$", re.IGNORECASE),
- "BANK_ACCOUNT": re.compile("^.*bank.*(acc|num).*$", re.IGNORECASE),
- "EMAIL_ADDRESS": re.compile("^(email|e-mail|mail)(.*address)?$", re.IGNORECASE),
- "USER_NAME": re.compile("^.*(user|client|person).*(name).*$", re.IGNORECASE),
- "PERSON": re.compile(
- "^.*(firstname|lastname|fullname|maidenname|nickname|name_suffix).*$",
- re.IGNORECASE,
- ),
- }
- non_sensitive_regex = {
- "BIRTH_DATE": re.compile(
- "^.*(date_of_birth|dateofbirth|dob|"
- "birthday|date_of_death|dateofdeath).*$",
- re.IGNORECASE,
- ),
- "GENDER": re.compile("^.*(gender).*$", re.IGNORECASE),
- "NATIONALITY": re.compile("^.*(nationality).*$", re.IGNORECASE),
- "ADDRESS": re.compile(
- "^.*(address|city|state|county|country|"
- "zipcode|zip|postal|zone|borough).*$",
- re.IGNORECASE,
- ),
- "PHONE_NUMBER": re.compile("^.*(phone).*$", re.IGNORECASE),
- }
-
- def scan(self, data: str) -> Optional[TagAndConfidence]:
- """
- Check the column name against the regex patterns and prepare the
- sensitive or non-sensitive tag
- """
- for pii_type_pattern in self.sensitive_regex.values():
- if pii_type_pattern.match(data) is not None:
- return TagAndConfidence(
- tag_fqn=fqn.build(
- metadata=None,
- entity_type=Tag,
- classification_name=PII,
- tag_name=TagType.SENSITIVE.value,
- ),
- confidence=1,
- )
-
- for pii_type_pattern in self.non_sensitive_regex.values():
- if pii_type_pattern.match(data) is not None:
- return TagAndConfidence(
- tag_fqn=fqn.build(
- metadata=None,
- entity_type=Tag,
- classification_name=PII,
- tag_name=TagType.NONSENSITIVE.value,
- ),
- confidence=1,
- )
-
- return None
diff --git a/ingestion/src/metadata/pii/scanners/ner_scanner.py b/ingestion/src/metadata/pii/scanners/ner_scanner.py
deleted file mode 100644
index 58e0ef84b32..00000000000
--- a/ingestion/src/metadata/pii/scanners/ner_scanner.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-NER Scanner based on Presidio.
-
-Supported Entities https://microsoft.github.io/presidio/supported_entities/
-"""
-import json
-import logging
-import traceback
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-from pydantic import BaseModel, ConfigDict
-
-from metadata.generated.schema.entity.classification.tag import Tag
-from metadata.pii.constants import PII, SPACY_EN_MODEL
-from metadata.pii.models import TagAndConfidence
-from metadata.pii.ner import NEREntity
-from metadata.pii.scanners.base import BaseScanner
-from metadata.utils import fqn
-from metadata.utils.logger import METADATA_LOGGER, pii_logger
-
-logger = pii_logger()
-SUPPORTED_LANG = "en"
-PRESIDIO_LOGGER = "presidio-analyzer"
-
-
-class StringAnalysis(BaseModel):
- """
- Used to store results from the sample data scans for each NER Entity
- """
-
- score: float
- appearances: int
-
-
-class NLPEngineModel(BaseModel):
- """Required to pass the nlp_engine as {"lang_code": "en", "model_name": "en_core_web_lg"}"""
-
- model_config = ConfigDict(protected_namespaces=())
- lang_code: str
- model_name: str
-
-
-# pylint: disable=import-outside-toplevel
-class NERScanner(BaseScanner):
- """Based on https://microsoft.github.io/presidio/"""
-
- def __init__(self):
- import spacy
- from presidio_analyzer import AnalyzerEngine
- from presidio_analyzer.nlp_engine.spacy_nlp_engine import SpacyNlpEngine
-
- try:
- spacy.load(SPACY_EN_MODEL)
- except OSError:
- logger.warning("Downloading en_core_web_md language model for the spaCy")
- from spacy.cli import download
-
- download(SPACY_EN_MODEL)
- spacy.load(SPACY_EN_MODEL)
-
- nlp_engine_model = NLPEngineModel(
- lang_code=SUPPORTED_LANG, model_name=SPACY_EN_MODEL
- )
-
- # Set the presidio logger to talk less about internal entities unless we are debugging
- logging.getLogger(PRESIDIO_LOGGER).setLevel(
- logging.INFO
- if logging.getLogger(METADATA_LOGGER).level == logging.DEBUG
- else logging.ERROR
- )
-
- self.analyzer = AnalyzerEngine(
- nlp_engine=SpacyNlpEngine(models=[nlp_engine_model.model_dump()])
- )
-
- @staticmethod
- def get_highest_score_label(
- entities_score: Dict[str, StringAnalysis]
- ) -> Tuple[str, float]:
- top_entity = max(
- entities_score,
- key=lambda type_: entities_score[type_].score
- * entities_score[type_].appearances
- * 0.8,
- )
- return top_entity, entities_score[top_entity].score
-
- def scan(self, data: List[Any]) -> Optional[TagAndConfidence]:
- """
- Scan the column's sample data rows and look for PII.
-
- How this works:
- 1. We create a list of strings [s1, s2, ..., sn] with each sample data row for a column
- 2. Then, for each s_i:
- a. Run the analyzer, which will return a list of possible recognized Entities and confidence score
- For example, the result of analyzing `123456789` gives us
- [
- type: DATE_TIME, start: 0, end: 9, score: 0.85,
- type: US_BANK_NUMBER, start: 0, end: 9, score: 0.05,
- type: US_PASSPORT, start: 0, end: 9, score: 0.05,
- type: US_DRIVER_LICENSE, start: 0, end: 9, score: 0.01
- ]
- b. Each time an `Entity` appears (e.g., DATE_TIME), we store its max score and the number of appearances
- 3. After gathering all the results for each row, get the `Entity` with maximum overall score
- and number of appearances. This gets computed as "score * appearances * 0.8", which can
- be thought as the "score" times "weighted down appearances".
- 4. Once we have the "top" `Entity` from that column, we assign the PII label accordingly from `NEREntity`.
- """
- logger.debug("Processing '%s'", data)
-
- # Initialize an empty dict for the given row list
- entities_score: Dict[str, StringAnalysis] = defaultdict(
- lambda: StringAnalysis(score=0, appearances=0)
- )
-
- str_sample_data_rows = [str(row) for row in data if row is not None]
- for row in str_sample_data_rows:
- try:
- self.process_data(row=row, entities_score=entities_score)
- except Exception as exc:
- logger.warning(f"Unknown error while processing {row} - {exc}")
- logger.debug(traceback.format_exc())
-
- if entities_score:
- label, score = self.get_highest_score_label(entities_score)
- tag_type = NEREntity.__members__.get(label)
- if not tag_type:
- return None
- return TagAndConfidence(
- tag_fqn=fqn.build(
- metadata=None,
- entity_type=Tag,
- classification_name=PII,
- tag_name=tag_type.value,
- ),
- confidence=score,
- )
-
- return None
-
- def process_data(self, row: str, entities_score: Dict[str, StringAnalysis]) -> None:
- """Process the Sample Data rows, checking if they are of JSON format as well"""
- # first, check if the data is JSON or we can work with strings
- is_json, value = self.is_json_data(row)
- if is_json and isinstance(value, dict):
- for val in value.values():
- self.process_data(row=str(val), entities_score=entities_score)
- elif is_json and isinstance(value, list):
- for val in value:
- self.process_data(row=str(val), entities_score=entities_score)
- else:
- self.scan_value(value=row, entities_score=entities_score)
-
- @staticmethod
- def is_json_data(value: str) -> Tuple[bool, Union[dict, list, None]]:
- """Check if the value is a JSON object that we need to process differently than strings"""
- try:
- res = json.loads(value)
- if isinstance(res, (dict, list)):
- return True, res
- return False, None
- except json.JSONDecodeError:
- return False, None
-
- def scan_value(self, value: str, entities_score: Dict[str, StringAnalysis]):
- """Scan the value for PII"""
- results = self.analyzer.analyze(value, language="en")
- for result in results:
- entities_score[result.entity_type] = StringAnalysis(
- score=result.score
- if result.score > entities_score[result.entity_type].score
- else entities_score[result.entity_type].score,
- appearances=entities_score[result.entity_type].appearances + 1,
- )
diff --git a/ingestion/tests/__init__.py b/ingestion/tests/__init__.py
index e69de29bb2d..129e753275c 100644
--- a/ingestion/tests/__init__.py
+++ b/ingestion/tests/__init__.py
@@ -0,0 +1,5 @@
+from pathlib import Path
+
+TESTS_ROOT_DIR = Path(__file__).parent
+INGESTION_ROOT_DIR = TESTS_ROOT_DIR.parent
+REPO_ROOT_DIR = INGESTION_ROOT_DIR.parent
diff --git a/ingestion/tests/integration/pii/__init__.py b/ingestion/tests/integration/pii/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/ingestion/tests/integration/orm_profiler/test_pii_processor.py b/ingestion/tests/integration/pii/test_pii_processor.py
similarity index 92%
rename from ingestion/tests/integration/orm_profiler/test_pii_processor.py
rename to ingestion/tests/integration/pii/test_pii_processor.py
index 76c03c37f01..e16033b9fb2 100644
--- a/ingestion/tests/integration/orm_profiler/test_pii_processor.py
+++ b/ingestion/tests/integration/pii/test_pii_processor.py
@@ -113,6 +113,15 @@ table_data = TableData(
EXPECTED_COLUMN_TAGS = [
+ ColumnTag(
+ column_fqn="test-service-table-patch.test-db.test-schema.customers.first_name",
+ tag_label=TagLabel(
+ tagFQN=TagFQN("General.Person"),
+ source="Classification",
+ labelType="Automated",
+ state="Suggested",
+ ),
+ ),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.first_name",
tag_label=TagLabel(
@@ -122,6 +131,15 @@ EXPECTED_COLUMN_TAGS = [
state="Suggested",
),
),
+ ColumnTag(
+ column_fqn="test-service-table-patch.test-db.test-schema.customers.first_order",
+ tag_label=TagLabel(
+ tagFQN=TagFQN("General.DateTime"),
+ source="Classification",
+ labelType="Automated",
+ state="Suggested",
+ ),
+ ),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.first_order",
tag_label=TagLabel(
@@ -131,6 +149,15 @@ EXPECTED_COLUMN_TAGS = [
state="Suggested",
),
),
+ ColumnTag(
+ column_fqn="test-service-table-patch.test-db.test-schema.customers.random",
+ tag_label=TagLabel(
+ tagFQN=TagFQN("General.Email"),
+ source="Classification",
+ labelType="Automated",
+ state="Suggested",
+ ),
+ ),
ColumnTag(
column_fqn="test-service-table-patch.test-db.test-schema.customers.random",
tag_label=TagLabel(
diff --git a/ingestion/tests/unit/pii/algorithms/test_classifiers.py b/ingestion/tests/unit/pii/algorithms/test_classifiers.py
index 73ebda39a61..2bf923afd2c 100644
--- a/ingestion/tests/unit/pii/algorithms/test_classifiers.py
+++ b/ingestion/tests/unit/pii/algorithms/test_classifiers.py
@@ -11,7 +11,7 @@
import inspect
from typing import Iterable, Tuple
-from metadata.pii.algorithms.classifiers import ColumnClassifier, HeuristicPIIClassifier
+from metadata.pii.algorithms.column_labelers import ColumnLabeler, HeuristicPIILabeler
from metadata.pii.algorithms.tags import PIITag
from .data import pii_samples
@@ -27,7 +27,7 @@ def get_sample_data() -> Iterable[Tuple[str, LabeledData]]:
yield name, obj
-def run_test_on_pii_classifier(pii_classifier: ColumnClassifier[PIITag]) -> str:
+def run_test_on_pii_classifier(pii_classifier: ColumnLabeler[PIITag]) -> str:
"""Apply the classifier to the data and check the results"""
tested_datasets = 0
@@ -49,6 +49,6 @@ def run_test_on_pii_classifier(pii_classifier: ColumnClassifier[PIITag]) -> str:
def test_pii_heuristic_classifier(pii_test_logger):
"""Test the PII heuristic classifier"""
- heuristic_classifier = HeuristicPIIClassifier()
+ heuristic_classifier = HeuristicPIILabeler()
results = run_test_on_pii_classifier(heuristic_classifier)
pii_test_logger.info(results)
diff --git a/ingestion/tests/unit/pii/algorithms/test_label_extractors.py b/ingestion/tests/unit/pii/algorithms/test_label_extractors.py
new file mode 100644
index 00000000000..c0b3006821f
--- /dev/null
+++ b/ingestion/tests/unit/pii/algorithms/test_label_extractors.py
@@ -0,0 +1,45 @@
+from metadata.pii.algorithms.label_extractors import ProbabilisticLabelExtractor
+
+
+def test_extract_labels_single_dominant_label():
+ extractor = ProbabilisticLabelExtractor(
+ score_threshold=0.2, prob_threshold=0.8, k=1
+ )
+
+ scores = {
+ "LabelA": 0.9,
+ "LabelB": 0.1,
+ }
+
+ # After filtering: {"LabelA": 0.9}
+ # Normalized: LabelA = 1.0
+ # Passes probability threshold
+ assert extractor.extract_labels(scores) == {"LabelA"}
+
+
+def test_extract_labels_top_label_below_probability_threshold():
+ extractor = ProbabilisticLabelExtractor(
+ score_threshold=0.1, prob_threshold=0.9, k=1
+ )
+
+ scores = {
+ "LabelA": 0.5,
+ "LabelB": 0.4,
+ }
+
+ # Normalized: A ≈ 0.56, B ≈ 0.44 → neither meets prob_threshold=0.9
+ assert extractor.extract_labels(scores) == set()
+
+
+def test_extract_labels_equal_scores_all_pass():
+ extractor = ProbabilisticLabelExtractor(
+ score_threshold=0.1, prob_threshold=0.3, k=2
+ )
+
+ scores = {
+ "LabelA": 0.4,
+ "LabelB": 0.4,
+ }
+
+ # Normalized: each = 0.5 → both ≥ prob_threshold
+ assert extractor.extract_labels(scores) == {"LabelA", "LabelB"}
diff --git a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
index 51b66131f66..2ba5bae3d4e 100644
--- a/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
+++ b/ingestion/tests/unit/pii/algorithms/test_presidio_utils.py
@@ -13,7 +13,7 @@ from metadata.pii.algorithms.presidio_utils import (
set_presidio_logger_level,
)
from metadata.pii.algorithms.tags import PIITag
-from metadata.pii.scanners.ner_scanner import SUPPORTED_LANG
+from metadata.pii.constants import SUPPORTED_LANG
def test_analyzer_supports_all_expected_pii_entities():
diff --git a/ingestion/tests/unit/pii/algorithms/test_tags.py b/ingestion/tests/unit/pii/algorithms/test_tags.py
new file mode 100644
index 00000000000..06916936c13
--- /dev/null
+++ b/ingestion/tests/unit/pii/algorithms/test_tags.py
@@ -0,0 +1,22 @@
+import json
+
+from metadata.pii.algorithms.tags import PIICategoryTag
+
+from .... import REPO_ROOT_DIR
+
+
+def test_pii_categories_agree_with_openmetadata_ner_entities() -> None:
+ """
+ Test that the PII categories agree with the OpenMetadata service
+ """
+ path = (
+ REPO_ROOT_DIR
+ / "openmetadata-service/src/main/resources/json/data/tags/NEREntityGeneralTags.json"
+ )
+ with open(path, "r") as file:
+ data = json.load(file)
+ tag_labels = {create_tag["name"] for create_tag in data["createTags"]}
+ pii_category_tag = {pii_cat_tag.value for pii_cat_tag in PIICategoryTag}
+ assert (
+ pii_category_tag == tag_labels
+ ), f"PII Category Tags {pii_category_tag} do not match OpenMetadata NEREntityGeneralTags {tag_labels}"
diff --git a/ingestion/tests/unit/pii/algorithms/test_tags_ops.py b/ingestion/tests/unit/pii/algorithms/test_tags_ops.py
new file mode 100644
index 00000000000..61b540c414a
--- /dev/null
+++ b/ingestion/tests/unit/pii/algorithms/test_tags_ops.py
@@ -0,0 +1,39 @@
+import pytest
+
+from metadata.pii.algorithms.tags import PIISensitivityTag, PIITag
+from metadata.pii.algorithms.tags_ops import categorize_pii_tag, resolve_sensitivity
+
+
+def test_each_pii_tag_is_mapped_to_a_pii_tag_category():
+ """
+ Test that each PII tag is mapped to a PII tag category.
+ """
+ for tag in PIITag:
+ try:
+ _ = categorize_pii_tag(tag)
+ except ValueError:
+ raise AssertionError(f"PII tag {tag} is not mapped to a category.")
+
+
+@pytest.mark.parametrize(
+ "input_tags,expected",
+ [
+ ([], None),
+ ([PIISensitivityTag.NONSENSITIVE], PIISensitivityTag.NONSENSITIVE),
+ ([PIISensitivityTag.SENSITIVE], PIISensitivityTag.SENSITIVE),
+ (
+ [PIISensitivityTag.NONSENSITIVE, PIISensitivityTag.NONSENSITIVE],
+ PIISensitivityTag.NONSENSITIVE,
+ ),
+ (
+ [PIISensitivityTag.NONSENSITIVE, PIISensitivityTag.SENSITIVE],
+ PIISensitivityTag.SENSITIVE,
+ ),
+ (
+ [PIISensitivityTag.SENSITIVE, PIISensitivityTag.SENSITIVE],
+ PIISensitivityTag.SENSITIVE,
+ ),
+ ],
+)
+def test_resolve_sensitivity(input_tags, expected):
+ assert resolve_sensitivity(input_tags) == expected
diff --git a/ingestion/tests/unit/pii/test_column_name_scanner.py b/ingestion/tests/unit/pii/test_column_name_scanner.py
deleted file mode 100644
index 8e08b60dc69..00000000000
--- a/ingestion/tests/unit/pii/test_column_name_scanner.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Test Column Name Scanner
-"""
-import pytest
-
-from metadata.pii.models import TagAndConfidence
-from metadata.pii.scanners.column_name_scanner import ColumnNameScanner
-
-EXPECTED_SENSITIVE = TagAndConfidence(
- tag_fqn="PII.Sensitive",
- confidence=1,
-)
-
-
-@pytest.fixture
-def scanner() -> ColumnNameScanner:
- """Return the scanner"""
- return ColumnNameScanner()
-
-
-def test_column_names_none(scanner):
- assert scanner.scan("access_channel") is None
- assert scanner.scan("status_reason") is None
-
- # Credit Card
- assert scanner.scan("credit") is None
- assert scanner.scan("user_credits") is None
-
- # Users
- assert scanner.scan("id") is None
- assert scanner.scan("user_id") is None
-
- # Mails
- assert scanner.scan("email_verified") is None
-
-
-def test_column_names_sensitive(scanner):
- # Bank
- assert scanner.scan("bank_account") == EXPECTED_SENSITIVE
-
- # Credit Card
- assert scanner.scan("credit_card") == EXPECTED_SENSITIVE
- assert scanner.scan("credit_card_number") == EXPECTED_SENSITIVE
- assert scanner.scan("personal_credit_card") == EXPECTED_SENSITIVE
-
- # Users
- assert scanner.scan("user_name") == EXPECTED_SENSITIVE
- assert scanner.scan("user_first_name") == EXPECTED_SENSITIVE
- assert scanner.scan("user_last_name") == EXPECTED_SENSITIVE
- assert scanner.scan("client_name") == EXPECTED_SENSITIVE
- assert scanner.scan("person_first_name") == EXPECTED_SENSITIVE
- assert scanner.scan("client_last_name") == EXPECTED_SENSITIVE
-
- assert scanner.scan("email") == EXPECTED_SENSITIVE
- assert scanner.scan("email_address") == EXPECTED_SENSITIVE
- assert scanner.scan("ssn") == EXPECTED_SENSITIVE
diff --git a/ingestion/tests/unit/pii/test_ner_scanner.py b/ingestion/tests/unit/pii/test_ner_scanner.py
deleted file mode 100644
index c4ee50a55b7..00000000000
--- a/ingestion/tests/unit/pii/test_ner_scanner.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Test Column Name Scanner
-"""
-from typing import Any
-
-import pytest
-
-from metadata.pii.scanners.ner_scanner import NERScanner, StringAnalysis
-
-
-@pytest.fixture
-def scanner() -> NERScanner:
- """Return the scanner"""
- return NERScanner()
-
-
-def test_scanner_none(scanner):
- assert scanner.scan(list(range(100))) is None
- assert (
- scanner.scan(
- " ".split(
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
- )
- )
- ) is None
-
-
-def test_scanner_sensitive(scanner):
- assert (
- scanner.scan(
- [
- "geraldc@gmail.com",
- "saratimithi@godesign.com",
- "heroldsean@google.com",
- ]
- ).tag_fqn
- == "PII.Sensitive"
- )
- assert (
- scanner.scan(["im ok", "saratimithi@godesign.com", "not sensitive"]).tag_fqn
- == "PII.Sensitive"
- )
-
-
-def test_scanner_nonsensitive(scanner):
- assert (
- scanner.scan(
- [
- "Washington",
- "Alaska",
- "Netherfield Lea Street",
- ]
- ).tag_fqn
- == "PII.NonSensitive"
- )
-
-
-def test_get_highest_score_label(scanner):
- """Validate that even with score clashes, we only get one result back"""
- assert scanner.get_highest_score_label(
- {
- "PII.Sensitive": StringAnalysis(score=0.9, appearances=1),
- "PII.NonSensitive": StringAnalysis(score=0.8, appearances=1),
- }
- ) == ("PII.Sensitive", 0.9)
- assert scanner.get_highest_score_label(
- {
- "PII.Sensitive": StringAnalysis(score=1.0, appearances=1),
- "PII.NonSensitive": StringAnalysis(score=1.0, appearances=1),
- }
- ) == ("PII.Sensitive", 1.0)
-
-
-@pytest.mark.parametrize(
- "data,is_json",
- [
- ("potato", (False, None)),
- ("1", (False, None)),
- ('{"key": "value"}', (True, {"key": "value"})),
- (
- '{"key": "value", "key2": "value2"}',
- (True, {"key": "value", "key2": "value2"}),
- ),
- ('["potato"]', (True, ["potato"])),
- ],
-)
-def test_is_json_data(scanner, data: Any, is_json: bool):
- """Assert we are flagging JSON data correctly"""
- assert scanner.is_json_data(data) == is_json
-
-
-def test_scanner_with_json(scanner):
- """Test the scanner with JSON data"""
-
- assert (
- scanner.scan(
- [
- '{"email": "johndoe@example.com", "address": {"street": "123 Main St"}}',
- '{"email": "potato", "age": 30, "preferences": {"newsletter": true, "notifications": "email"}}',
- ]
- ).tag_fqn
- == "PII.Sensitive"
- )
-
- assert (
- scanner.scan(
- [
- '{"email": "foo", "address": {"street": "bar"}}',
- '{"email": "potato", "age": 30, "preferences": {"newsletter": true, "notifications": "email"}}',
- ]
- )
- is None
- )
-
-
-def test_scanner_with_lists(scanner):
- """Test the scanner with list data"""
-
- assert scanner.scan(["foo", "bar", "biz"]) is None
-
- assert (
- scanner.scan(["foo", "bar", "johndoe@example.com"]).tag_fqn == "PII.Sensitive"
- )
-
- assert (
- scanner.scan(
- [
- '{"emails": ["johndoe@example.com", "lima@example.com"]}',
- '{"emails": ["foo", "bar", "biz"]}',
- ]
- ).tag_fqn
- == "PII.Sensitive"
- )
-
-
-def test_scan_entities(scanner):
- """
- We can properly validate certain entities.
-
- > NOTE: These lists are randomly generated and not valid IDs for any actual use
- """
- pan_numbers = ["AFZPK7190K", "BLQSM2938L", "CWRTJ5821M", "DZXNV9045A", "EHYKG6752P"]
- assert scanner.scan(pan_numbers).tag_fqn == "PII.Sensitive"
-
- ssn_numbers = [
- "123-45-6789",
- "987-65-4321",
- "543-21-0987",
- "678-90-1234",
- "876-54-3210",
- ]
- assert scanner.scan(ssn_numbers).tag_fqn == "PII.Sensitive"
-
- nif_numbers = ["12345678A", "87654321B", "23456789C", "98765432D", "34567890E"]
- assert scanner.scan(nif_numbers).tag_fqn == "PII.Sensitive"
diff --git a/ingestion/tests/unit/pii/test_pii_sensitive.py b/ingestion/tests/unit/pii/test_pii_sensitive.py
deleted file mode 100644
index dd532f4e93b..00000000000
--- a/ingestion/tests/unit/pii/test_pii_sensitive.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright 2025 Collate
-# Licensed under the Collate Community License, Version 1.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from metadata.generated.schema.type.tagLabel import LabelType, State, TagSource
-from metadata.pii.algorithms.tags import PIISensitivityTag
-from metadata.pii.processor import PIIProcessor
-
-
-def test_pii_processor_build_tag_label_for_pii_sensitive():
-
- tag = PIISensitivityTag.SENSITIVE
- tag_label = PIIProcessor.build_tag_label(tag)
-
- assert tag_label.tagFQN.root == "PII.Sensitive"
- assert tag_label.source == TagSource.Classification
- assert tag_label.state == State.Suggested
- assert tag_label.labelType == LabelType.Generated
-
-
-def test_pii_processor_build_tag_label_for_pii_nonsensitive():
- tag = PIISensitivityTag.NONSENSITIVE
- tag_label = PIIProcessor.build_tag_label(tag)
-
- assert tag_label.tagFQN.root == "PII.NonSensitive"
- assert tag_label.source == TagSource.Classification
- assert tag_label.state == State.Suggested
- assert tag_label.labelType == LabelType.Generated
diff --git a/ingestion/tests/unit/pii/test_processor.py b/ingestion/tests/unit/pii/test_processor.py
new file mode 100644
index 00000000000..73f931492c2
--- /dev/null
+++ b/ingestion/tests/unit/pii/test_processor.py
@@ -0,0 +1,33 @@
+from metadata.generated.schema.type.tagLabel import LabelType, TagSource
+from metadata.pii.algorithms.tags import PIICategoryTag, PIISensitivityTag
+from metadata.pii.processor import get_tag_label
+
+
+def test_get_general_tag_label_from_pii_tag_category():
+ """
+ Test that building the General tag label for every PII category tag never fails.
+ """
+ for tag in PIICategoryTag:
+ try:
+ tag_label = get_tag_label(tag)
+ assert tag_label.tagFQN.root == f"General.{tag.value}"
+ assert tag_label.source == TagSource.Classification
+ assert tag_label.labelType == LabelType.Generated
+ except ValueError:
+ raise AssertionError(f"Failed to get general tag FQN for tag {tag}.")
+
+
+def test_get_general_tag_label_from_pii_sensitivity():
+ """
+ Test that building the PII tag label for every PII sensitivity tag never fails.
+ """
+ for tag in PIISensitivityTag:
+ try:
+ tag_label = get_tag_label(tag)
+ assert tag_label.tagFQN.root == f"PII.{tag.value}"
+ assert tag_label.source == TagSource.Classification
+ assert tag_label.labelType == LabelType.Generated
+ except ValueError:
+ raise AssertionError(
+ f"Failed to get general tag FQN for sensitivity {tag}."
+ )
diff --git a/openmetadata-service/src/main/resources/json/data/tags/NEREntityGeneralTags.json b/openmetadata-service/src/main/resources/json/data/tags/NEREntityGeneralTags.json
new file mode 100644
index 00000000000..11cc04ecf0e
--- /dev/null
+++ b/openmetadata-service/src/main/resources/json/data/tags/NEREntityGeneralTags.json
@@ -0,0 +1,94 @@
+{
+ "createClassification": {
+ "name": "General",
+ "description": "Category describing generic data types, such as `DateTime`, `Location`, or `BankNumber`",
+ "provider": "system",
+ "mutuallyExclusive": "false"
+ },
+ "createTags": [
+ {
+ "name": "DateTime",
+ "description": "Absolute or relative dates or periods or times smaller than a day."
+ },
+ {
+ "name": "Password",
+ "description": "Field holding password information."
+ },
+ {
+ "name": "BirthDate",
+ "description": "Person's birth date."
+ },
+ {
+ "name": "Gender",
+ "description": "Person's gender."
+ },
+ {
+ "name": "Location",
+ "description": "Name of politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains."
+ },
+ {
+ "name": "Address",
+ "description": "Address of a Person."
+ },
+ {
+ "name": "MedicalLicense",
+ "description": "Common medical license numbers."
+ },
+ {
+ "name": "URL",
+ "description": "A URL (Uniform Resource Locator), unique identifier used to locate a resource on the Internet."
+ },
+ {
+ "name": "IBANCode",
+ "description": "The International Bank Account Number (IBAN) is an internationally agreed system of identifying bank accounts across national borders to facilitate the communication and processing of cross border transactions with a reduced risk of transcription errors."
+ },
+ {
+ "name": "BankNumber",
+ "description": "Bank Account Number."
+ },
+ {
+ "name": "CreditCardNumber",
+ "description": "Credit card number of the user."
+ },
+ {
+ "name": "Crypto",
+ "description": "Crypto Wallet Number."
+ },
+ {
+ "name": "DriverLicense",
+ "description": "Person's driver's license image or number."
+ },
+ {
+ "name": "Email",
+ "description": "Email address."
+ },
+ {
+ "name": "IPAddress",
+ "description": "An Internet Protocol (IP) address (either IPv4 or IPv6)."
+ },
+ {
+ "name": "NRP",
+ "description": "A person’s Nationality, religious or political group."
+ },
+ {
+ "name": "Person",
+ "description": "A full person name, which can include first names, middle names or initials, and last names."
+ },
+ {
+ "name": "PhoneNumber",
+ "description": "A telephone number."
+ },
+ {
+ "name": "VATCode",
+ "description": "VAT code or identification number. See [VAT Identification Number](https://en.wikipedia.org/wiki/VAT_identification_number)."
+ },
+ {
+ "name": "NationalID",
+ "description": "Region specific identifiers, such as [ABN](https://en.wikipedia.org/wiki/Australian_Business_Number), [NIF](https://es.wikipedia.org/wiki/N%C3%BAmero_de_identificaci%C3%B3n_fiscal) or [NHS](https://en.wikipedia.org/wiki/National_Health_Service)."
+ },
+ {
+ "name": "Passport",
+ "description": "National passport numbers, such as US or IT passports."
+ }
+ ]
+}