improvement in pii tagging (#10696)

* improvement in pii tagging

* fix conflict and changes as per comment

* Added confidence field

* changes as per comments

* Apply suggestions from code review

Co-authored-by: Teddy <teddy.crepineau@gmail.com>

---------

Co-authored-by: Ashish Gupta <ashish@getcollate.io>
Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com>
Co-authored-by: Teddy <teddy.crepineau@gmail.com>
This commit is contained in:
NiharDoshi99 2023-03-28 19:37:48 +05:30 committed by GitHub
parent 4853dfc6c8
commit 46afe69811
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 169 additions and 36 deletions

View File

@ -13,8 +13,9 @@
Processor util to fetch pii sensitive columns
"""
import logging
import re
import traceback
from enum import Enum
from enum import Enum, auto
from typing import Optional, Tuple
from metadata.generated.schema.entity.classification.tag import Tag
@ -25,11 +26,73 @@ from metadata.utils import fqn
PII = "PII"
class PiiTypes(Enum):
    """PiiTypes enumerates the different types of PII data"""

    # Values are assigned with auto(); only the member identity is used
    # (as keys in ColumnNameScanner's regex tables), so the concrete
    # integer values are not significant.
    NONE = auto()
    UNSUPPORTED = auto()
    PHONE = auto()
    EMAIL = auto()
    CREDIT_CARD = auto()
    ADDRESS = auto()
    ADDRESS_LOCATION = auto()
    PERSON = auto()
    LOCATION = auto()
    BIRTH_DATE = auto()
    GENDER = auto()
    NATIONALITY = auto()
    IP_ADDRESS = auto()
    SSN = auto()
    USER_NAME = auto()
    PASSWORD = auto()
    ETHNICITY = auto()
    TAX_ID = auto()
    KEY = auto()
    BANKACC = auto()
class TagType(Enum):
    """Sensitivity buckets used as PII tag names.

    The string values are used directly as tag names under the PII
    classification (e.g. "PII.Sensitive") when building tag FQNs.
    """

    SENSITIVE = "Sensitive"
    NONSENSITIVE = "NonSensitive"
class ColumnNameScanner:
    """
    Column Name Scanner to scan column name
    """

    # Pre-compiled, case-insensitive patterns keyed by the PII type they
    # detect. Patterns are matched with `.match()`, so the "^.*...*$"
    # wrapping makes them behave like a substring search over the whole
    # column name. Callers check sensitive_regex before
    # non_sensitive_regex, so a name matching both is tagged Sensitive.
    sensitive_regex = {
        PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
        PiiTypes.USER_NAME: re.compile("^.*user(id|name|).*$", re.IGNORECASE),
        PiiTypes.KEY: re.compile("^.*(key).*$", re.IGNORECASE),
        PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
        PiiTypes.CREDIT_CARD: re.compile("^.*(card).*$", re.IGNORECASE),
        PiiTypes.BANKACC: re.compile("^.*(bank|acc|amount).*$", re.IGNORECASE),
        PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
    }
    non_sensitive_regex = {
        PiiTypes.PERSON: re.compile(
            "^.*(firstname|fname|lastname|lname|"
            "fullname|maidenname|_name|"
            "nickname|name_suffix|name).*$",
            re.IGNORECASE,
        ),
        PiiTypes.BIRTH_DATE: re.compile(
            "^.*(date_of_birth|dateofbirth|dob|"
            "birthday|date_of_death|dateofdeath).*$",
            re.IGNORECASE,
        ),
        PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
        PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
        PiiTypes.ADDRESS: re.compile(
            "^.*(address|city|state|county|country|"
            "zipcode|zip|postal|zone|borough).*$",
            re.IGNORECASE,
        ),
        PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
    }
class NEREntity(Enum):
CREDIT_CARD = TagType.SENSITIVE.value
EMAIL_ADDRESS = TagType.SENSITIVE.value
@ -42,7 +105,7 @@ class NEREntity(Enum):
US_DRIVER_LICENSE = TagType.SENSITIVE.value
DATE_TIME = TagType.NONSENSITIVE.value
URL = TagType.SENSITIVE.value
US_BANK_NUMBER = TagType.NONSENSITIVE.value
US_BANK_NUMBER = TagType.SENSITIVE.value
US_SSN = TagType.SENSITIVE.value
PERSON = TagType.SENSITIVE.value
US_PASSPORT = TagType.SENSITIVE.value
@ -78,11 +141,21 @@ class NERScanner:
most_used_label_occurrence = score[1]
return label_score or (None, None)
def column_name_scan(self, column_name: str):
    """Classify a column from its name alone.

    Checks the name against ColumnNameScanner's sensitive patterns
    first, then the non-sensitive ones. Returns a ``(tag_name,
    confidence)`` pair with confidence fixed at 1, or ``None`` when no
    pattern matches (so callers can fall back to content scanning).
    """
    if any(
        pattern.match(column_name)
        for pattern in ColumnNameScanner.sensitive_regex.values()
    ):
        return TagType.SENSITIVE.value, 1
    if any(
        pattern.match(column_name)
        for pattern in ColumnNameScanner.non_sensitive_regex.values()
    ):
        return TagType.NONSENSITIVE.value, 1
    return None
def scan(self, text) -> Tuple[str, float]:
"""Scan the text and return an pii tag fqn and confidence/score"""
logging.debug("Processing '%s'", text)
pii_tag_fqn = ""
labels_score = {}
self.text = [str(row) for row in text if row is not None]
for row in self.text:
@ -106,19 +179,23 @@ class NERScanner:
label, score = self.get_highest_score_label(labels_score)
if label and score:
label_type = NEREntity.__members__.get(
tag_type = NEREntity.__members__.get(
label, TagType.NONSENSITIVE.value
).value
pii_tag_fqn = fqn.build(
self.metadata,
entity_type=Tag,
classification_name=PII,
tag_name=label_type,
)
return tag_type, score
return pii_tag_fqn or "", score or 0
return "", 0
def process(self, table_data: TableData, table_entity: Table, client: OpenMetadata):
def process(
self,
table_data: TableData,
table_entity: Table,
client: OpenMetadata,
thresold_confidence: float,
):
"""
process function to start processing sample data
"""
len_of_rows = len(table_data.rows[0]) if table_data.rows else 0
for idx in range(len_of_rows):
pii_found = False
@ -128,11 +205,19 @@ class NERScanner:
continue
if pii_found is True:
continue
pii_tag_fqn, confidence = self.scan([row[idx] for row in table_data.rows])
if pii_tag_fqn and confidence >= 0.8:
tag_type, confidence = self.column_name_scan(
table_data.columns[idx].__root__
) or self.scan([row[idx] for row in table_data.rows])
if tag_type and confidence >= thresold_confidence / 100:
tag_fqn = fqn.build(
self.metadata,
entity_type=Tag,
classification_name=PII,
tag_name=tag_type,
)
client.patch_column_tag(
entity_id=table_entity.id,
column_name=table_entity.columns[idx].name.__root__,
tag_fqn=pii_tag_fqn,
tag_fqn=tag_fqn,
is_suggested=True,
)

View File

@ -508,6 +508,7 @@ class Profiler(Generic[TMetric]):
sample_data,
self.profiler_interface.table_entity, # type: ignore
self.profiler_interface.ometa_client, # type: ignore
self.profiler_interface.source_config.confidence,
)
except Exception as exc:
logger.warning(

View File

@ -685,12 +685,10 @@ UPDATED_TABLE_ENTITY = [
fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.first_name",
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.Sensitive"),
tagFQN=TagFQN(__root__="PII.NonSensitive"),
description=(
(
"PII which if lost, compromised, or disclosed without authorization, could result in "
"substantial harm, embarrassment, inconvenience, or unfairness to an individual."
)
"PII which is easily accessible from public sources and can include zip code, "
"race, gender, and date of birth."
),
source="Classification",
labelType="Automated",
@ -716,7 +714,19 @@ UPDATED_TABLE_ENTITY = [
dataTypeDisplay="varchar",
description=None,
fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.last_name",
tags=[],
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.NonSensitive"),
description=(
"PII which is easily accessible from public sources and can include zip code, "
"race, gender, and date of birth."
),
source="Classification",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=None,
jsonSchema=None,
@ -884,7 +894,10 @@ class PiiProcessorTest(TestCase):
TABLE_ENTITY.id = table_entity.id
self.nerscanner_processor.process(
table_data=table_data, table_entity=TABLE_ENTITY, client=self.metadata
table_data=table_data,
table_entity=TABLE_ENTITY,
client=self.metadata,
thresold_confidence=85,
)
updated_table_entity = self.metadata.get_by_id(
entity=Table, entity_id=table_entity.id, fields=["tags"]

View File

@ -36,6 +36,11 @@
"type": "boolean",
"default": false
},
"confidence": {
"description": "Confidence threshold (0-100) that a column's PII score must reach before the column is tagged",
"type": "number",
"default": 80
},
"generateSampleData": {
"description": "Option to turn on/off generating sample data.",
"type": "boolean",

View File

@ -194,6 +194,7 @@ const AddIngestion = ({
resultLimit: sourceConfig?.resultLimit ?? 1000,
metadataToESConfig: undefined,
dbtUpdateDescriptions: sourceConfig?.dbtUpdateDescriptions ?? false,
confidence: sourceConfig?.confidence,
dbtClassificationName:
sourceConfig?.dbtClassificationName ?? DBT_CLASSIFICATION_DEFAULT_VALUE, // default value from Json Schema
}),
@ -425,6 +426,7 @@ const AddIngestion = ({
threadCount,
timeoutSeconds,
processPii,
confidence,
} = state;
switch (type) {
case PipelineType.Usage: {
@ -460,6 +462,7 @@ const AddIngestion = ({
type: profilerIngestionType,
generateSampleData: ingestSampleData,
profileSample: profileSample,
confidence: processPii ? confidence : undefined,
profileSampleType: profileSampleType,
threadCount: threadCount,
timeoutSeconds: timeoutSeconds,

View File

@ -103,6 +103,7 @@ const mockConfigureIngestion: ConfigureIngestionProps = {
threadCount: 5,
timeoutSeconds: 43200,
useFqnFilter: false,
confidence: 80,
} as unknown as AddIngestionState,
};

View File

@ -85,6 +85,7 @@ const ConfigureIngestion = ({
topicFilterPattern,
useFqnFilter,
processPii,
confidence,
overrideOwner,
} = useMemo(
() => ({
@ -124,6 +125,7 @@ const ConfigureIngestion = ({
useFqnFilter: data.useFqnFilter,
processPii: data.processPii,
overrideOwner: data.overrideOwner,
confidence: data.confidence,
}),
[data]
);
@ -150,6 +152,11 @@ const ConfigureIngestion = ({
profileSample: profileSample ?? undefined,
});
const handleConfidenceScore = (confidence: number | undefined | null) =>
onChange({
confidence: confidence ?? undefined,
});
const handleProfileSampleTypeChange = (value: ProfileSampleType) => {
onChange({
profileSampleType: value,
@ -481,20 +488,33 @@ const ConfigureIngestion = ({
const getProcessPiiTogglesForProfiler = () => {
return (
<Field>
<div className="tw-flex tw-gap-1">
<label>{t('label.auto-tag-pii-uppercase')}</label>
<ToggleSwitchV1
checked={processPii}
handleCheck={handleProcessPii}
testId="include-lineage"
/>
</div>
<p className="tw-text-grey-muted tw-mt-3">
{t('message.process-pii-sensitive-column-message-profiler')}
</p>
{getSeparator('')}
</Field>
<Fragment>
<Field>
<div className="tw-flex tw-gap-1">
<label>{t('label.auto-tag-pii-uppercase')}</label>
<ToggleSwitchV1
checked={processPii}
handleCheck={handleProcessPii}
testId="include-lineage"
/>
</div>
<p className="tw-text-grey-muted tw-mt-3">
{t('message.process-pii-sensitive-column-message-profiler')}
</p>
{processPii && (
<>
{getSeparator('')}
<Typography.Paragraph className="text-grey-muted m-t-0 m-b-xs text-sm">
{t('message.confidence-percentage-message')}
</Typography.Paragraph>
<SliderWithInput
value={confidence || 80}
onChange={handleConfidenceScore}
/>
</>
)}
</Field>
</Fragment>
);
};

View File

@ -135,6 +135,7 @@ export interface AddIngestionState {
useFqnFilter: boolean;
processPii: boolean;
overrideOwner: boolean;
confidence?: number;
}
export enum ShowFilter {

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "closed this task",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "closed this task",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Entrer le niveau de confiance à utiliser pour le modèle NLP lorsque celui-ci détermine si un champ contient des données IIP.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "このタスクを閉じました",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File

@ -910,6 +910,7 @@
"click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
"closed-this-task": "closed this task",
"collaborate-with-other-user": "to collaborate with other users.",
"confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.",
"configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
"configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
"configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",