Improvement in PII tagging (#10696)

* Improvement in PII tagging

* Fix conflicts and apply changes as per review comments

* Added confidence field

* Changes as per review comments

* Apply suggestions from code review

Co-authored-by: Teddy <teddy.crepineau@gmail.com>

---------

Co-authored-by: Ashish Gupta <ashish@getcollate.io>
Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com>
Co-authored-by: Teddy <teddy.crepineau@gmail.com>
NiharDoshi99 committed 2023-03-28 19:37:48 +05:30 (committed by GitHub)
parent 4853dfc6c8
commit 46afe69811
12 changed files with 169 additions and 36 deletions

View File (PII scanner processor util, Python)

@@ -13,8 +13,9 @@
 Processor util to fetch pii sensitive columns
 """
 import logging
+import re
 import traceback
-from enum import Enum
+from enum import Enum, auto
 from typing import Optional, Tuple
 from metadata.generated.schema.entity.classification.tag import Tag
@@ -25,11 +26,73 @@ from metadata.utils import fqn
 PII = "PII"
 
 
+class PiiTypes(Enum):
+    """PiiTypes enumerates the different types of PII data"""
+
+    NONE = auto()
+    UNSUPPORTED = auto()
+    PHONE = auto()
+    EMAIL = auto()
+    CREDIT_CARD = auto()
+    ADDRESS = auto()
+    ADDRESS_LOCATION = auto()
+    PERSON = auto()
+    LOCATION = auto()
+    BIRTH_DATE = auto()
+    GENDER = auto()
+    NATIONALITY = auto()
+    IP_ADDRESS = auto()
+    SSN = auto()
+    USER_NAME = auto()
+    PASSWORD = auto()
+    ETHNICITY = auto()
+    TAX_ID = auto()
+    KEY = auto()
+    BANKACC = auto()
+
+
 class TagType(Enum):
     SENSITIVE = "Sensitive"
     NONSENSITIVE = "NonSensitive"
 
 
+class ColumnNameScanner:
+    """
+    Column Name Scanner to scan column name
+    """
+
+    sensitive_regex = {
+        PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
+        PiiTypes.USER_NAME: re.compile("^.*user(id|name|).*$", re.IGNORECASE),
+        PiiTypes.KEY: re.compile("^.*(key).*$", re.IGNORECASE),
+        PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
+        PiiTypes.CREDIT_CARD: re.compile("^.*(card).*$", re.IGNORECASE),
+        PiiTypes.BANKACC: re.compile("^.*(bank|acc|amount).*$", re.IGNORECASE),
+        PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
+    }
+    non_sensitive_regex = {
+        PiiTypes.PERSON: re.compile(
+            "^.*(firstname|fname|lastname|lname|"
+            "fullname|maidenname|_name|"
+            "nickname|name_suffix|name).*$",
+            re.IGNORECASE,
+        ),
+        PiiTypes.BIRTH_DATE: re.compile(
+            "^.*(date_of_birth|dateofbirth|dob|"
+            "birthday|date_of_death|dateofdeath).*$",
+            re.IGNORECASE,
+        ),
+        PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
+        PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
+        PiiTypes.ADDRESS: re.compile(
+            "^.*(address|city|state|county|country|"
+            "zipcode|zip|postal|zone|borough).*$",
+            re.IGNORECASE,
+        ),
+        PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
+    }
+
+
 class NEREntity(Enum):
     CREDIT_CARD = TagType.SENSITIVE.value
     EMAIL_ADDRESS = TagType.SENSITIVE.value
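
For context, the ColumnNameScanner added above is a pure regex heuristic: sensitive patterns are tried before non-sensitive ones, and the first match wins with a fixed confidence of 1. A minimal standalone sketch of the same idea (pattern tables trimmed for brevity; structure follows the diff):

import re
from typing import Optional, Tuple

# Trimmed-down pattern tables mirroring the ones added in this change
SENSITIVE = [re.compile(p, re.IGNORECASE) for p in (r"^.*password.*$", r"^.*(ssn|social).*$")]
NON_SENSITIVE = [re.compile(p, re.IGNORECASE) for p in (r"^.*(gender).*$", r"^.*(phone).*$")]

def column_name_scan(column_name: str) -> Optional[Tuple[str, int]]:
    """Return (tag, confidence) on a name match, else None."""
    for pattern in SENSITIVE:  # sensitive patterns take priority
        if pattern.match(column_name):
            return "Sensitive", 1
    for pattern in NON_SENSITIVE:
        if pattern.match(column_name):
            return "NonSensitive", 1
    return None  # caller falls through to the NER scan

print(column_name_scan("user_password"))  # ('Sensitive', 1)
print(column_name_scan("phone_number"))   # ('NonSensitive', 1)
print(column_name_scan("order_total"))    # None
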
@@ -42,7 +105,7 @@ class NEREntity(Enum):
     US_DRIVER_LICENSE = TagType.SENSITIVE.value
     DATE_TIME = TagType.NONSENSITIVE.value
     URL = TagType.SENSITIVE.value
-    US_BANK_NUMBER = TagType.NONSENSITIVE.value
+    US_BANK_NUMBER = TagType.SENSITIVE.value
     US_SSN = TagType.SENSITIVE.value
     PERSON = TagType.SENSITIVE.value
     US_PASSPORT = TagType.SENSITIVE.value
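
Note the one functional change in this hunk: US_BANK_NUMBER is reclassified from NonSensitive to Sensitive. Because several NEREntity members share the same value, Python treats the later ones as enum aliases, which is why the lookup in scan() goes through NEREntity.__members__ (which includes aliases) rather than NEREntity(label). A small sketch of that lookup, with the missing-label fallback handled explicitly:

from enum import Enum

class TagType(Enum):
    SENSITIVE = "Sensitive"
    NONSENSITIVE = "NonSensitive"

class NEREntity(Enum):
    CREDIT_CARD = TagType.SENSITIVE.value
    DATE_TIME = TagType.NONSENSITIVE.value
    US_BANK_NUMBER = TagType.SENSITIVE.value  # alias of CREDIT_CARD; reclassified here

def tag_for(label: str) -> str:
    # Unknown NER labels default to NonSensitive, mirroring the
    # NEREntity.__members__.get(...) lookup used in scan()
    member = NEREntity.__members__.get(label)
    return member.value if member else TagType.NONSENSITIVE.value

print(tag_for("US_BANK_NUMBER"))   # Sensitive
print(tag_for("BRAND_NEW_LABEL"))  # NonSensitive
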
@@ -78,11 +141,21 @@ class NERScanner:
             most_used_label_occurrence = score[1]
         return label_score or (None, None)
 
+    def column_name_scan(self, column_name: str):
+        for pii_type_pattern in ColumnNameScanner.sensitive_regex.values():
+            if pii_type_pattern.match(column_name) is not None:
+                return TagType.SENSITIVE.value, 1
+
+        for pii_type_pattern in ColumnNameScanner.non_sensitive_regex.values():
+            if pii_type_pattern.match(column_name) is not None:
+                return TagType.NONSENSITIVE.value, 1
+
+        return None
+
     def scan(self, text) -> Tuple[str, float]:
         """Scan the text and return an pii tag fqn and confidence/score"""
         logging.debug("Processing '%s'", text)
-        pii_tag_fqn = ""
         labels_score = {}
         self.text = [str(row) for row in text if row is not None]
         for row in self.text:
@@ -106,19 +179,23 @@ class NERScanner:
         label, score = self.get_highest_score_label(labels_score)
         if label and score:
-            label_type = NEREntity.__members__.get(
+            tag_type = NEREntity.__members__.get(
                 label, TagType.NONSENSITIVE.value
             ).value
-            pii_tag_fqn = fqn.build(
-                self.metadata,
-                entity_type=Tag,
-                classification_name=PII,
-                tag_name=label_type,
-            )
+            return tag_type, score
 
-        return pii_tag_fqn or "", score or 0
+        return "", 0
 
-    def process(self, table_data: TableData, table_entity: Table, client: OpenMetadata):
+    def process(
+        self,
+        table_data: TableData,
+        table_entity: Table,
+        client: OpenMetadata,
+        thresold_confidence: float,
+    ):
+        """
+        process function to start processing sample data
+        """
         len_of_rows = len(table_data.rows[0]) if table_data.rows else 0
         for idx in range(len_of_rows):
             pii_found = False
@@ -128,11 +205,19 @@ class NERScanner:
                 continue
             if pii_found is True:
                 continue
-            pii_tag_fqn, confidence = self.scan([row[idx] for row in table_data.rows])
-            if pii_tag_fqn and confidence >= 0.8:
+            tag_type, confidence = self.column_name_scan(
+                table_data.columns[idx].__root__
+            ) or self.scan([row[idx] for row in table_data.rows])
+            if tag_type and confidence >= thresold_confidence / 100:
+                tag_fqn = fqn.build(
+                    self.metadata,
+                    entity_type=Tag,
+                    classification_name=PII,
+                    tag_name=tag_type,
+                )
                 client.patch_column_tag(
                     entity_id=table_entity.id,
                     column_name=table_entity.columns[idx].name.__root__,
-                    tag_fqn=pii_tag_fqn,
+                    tag_fqn=tag_fqn,
                     is_suggested=True,
                 )
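
Putting the pieces of process() together: a column-name hit short-circuits the more expensive NER scan (the name scan returns None on no match, so the or-expression falls through), and the new thresold_confidence parameter is a percentage compared against a 0-1 score, so a name match (confidence 1) clears any threshold up to 100. A compact sketch of the per-column decision, with the two scanners injected as plain callables so the snippet runs standalone:

# Sketch of the decision in process(); names simplified from the diff
def decide(column_name, column_values, thresold_confidence, name_scan, ner_scan):
    # A name match returns (tag, 1); None falls through to the NER scan
    tag_type, confidence = name_scan(column_name) or ner_scan(column_values)
    # thresold_confidence is a percent (e.g. 80); scan scores are 0..1
    if tag_type and confidence >= thresold_confidence / 100:
        return tag_type
    return None

print(decide("email", ["a@b.com"], 80,
             name_scan=lambda name: ("Sensitive", 1) if "mail" in name else None,
             ner_scan=lambda values: ("", 0)))  # Sensitive
print(decide("notes", ["hello"], 80,
             name_scan=lambda name: None,
             ner_scan=lambda values: ("Sensitive", 0.6)))  # None: 0.6 < 0.8
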

View File (profiler core, Python)

@@ -508,6 +508,7 @@ class Profiler(Generic[TMetric]):
                 sample_data,
                 self.profiler_interface.table_entity,  # type: ignore
                 self.profiler_interface.ometa_client,  # type: ignore
+                self.profiler_interface.source_config.confidence,
             )
         except Exception as exc:
             logger.warning(

View File (PII processor unit tests, Python)

@@ -685,12 +685,10 @@ UPDATED_TABLE_ENTITY = [
         fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.first_name",
         tags=[
             TagLabel(
-                tagFQN=TagFQN(__root__="PII.Sensitive"),
+                tagFQN=TagFQN(__root__="PII.NonSensitive"),
                 description=(
-                    (
-                        "PII which if lost, compromised, or disclosed without authorization, could result in "
-                        "substantial harm, embarrassment, inconvenience, or unfairness to an individual."
-                    )
+                    "PII which is easily accessible from public sources and can include zip code, "
+                    "race, gender, and date of birth."
                 ),
                 source="Classification",
                 labelType="Automated",
@@ -716,7 +714,19 @@
         dataTypeDisplay="varchar",
         description=None,
         fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.last_name",
-        tags=[],
+        tags=[
+            TagLabel(
+                tagFQN=TagFQN(__root__="PII.NonSensitive"),
+                description=(
+                    "PII which is easily accessible from public sources and can include zip code, "
+                    "race, gender, and date of birth."
+                ),
+                source="Classification",
+                labelType="Automated",
+                state="Suggested",
+                href=None,
+            )
+        ],
         constraint=None,
         ordinalPosition=None,
         jsonSchema=None,
@@ -884,7 +894,10 @@ class PiiProcessorTest(TestCase):
         TABLE_ENTITY.id = table_entity.id
         self.nerscanner_processor.process(
-            table_data=table_data, table_entity=TABLE_ENTITY, client=self.metadata
+            table_data=table_data,
+            table_entity=TABLE_ENTITY,
+            client=self.metadata,
+            thresold_confidence=85,
         )
         updated_table_entity = self.metadata.get_by_id(
             entity=Table, entity_id=table_entity.id, fields=["tags"]

View File (profiler pipeline JSON schema)

@@ -36,6 +36,11 @@
       "type": "boolean",
       "default": false
     },
+    "confidence": {
+      "description": "Set the Confidence value for which you want the column to be marked",
+      "type": "number",
+      "default": 80
+    },
     "generateSampleData": {
       "description": "Option to turn on/off generating sample data.",
       "type": "boolean",

View File (AddIngestion component, TypeScript)

@@ -194,6 +194,7 @@ const AddIngestion = ({
       resultLimit: sourceConfig?.resultLimit ?? 1000,
       metadataToESConfig: undefined,
       dbtUpdateDescriptions: sourceConfig?.dbtUpdateDescriptions ?? false,
+      confidence: sourceConfig?.confidence,
       dbtClassificationName:
         sourceConfig?.dbtClassificationName ?? DBT_CLASSIFICATION_DEFAULT_VALUE, // default value from Json Schema
     }),
@@ -425,6 +426,7 @@ const AddIngestion = ({
       threadCount,
       timeoutSeconds,
       processPii,
+      confidence,
     } = state;
     switch (type) {
       case PipelineType.Usage: {
@@ -460,6 +462,7 @@ const AddIngestion = ({
           type: profilerIngestionType,
           generateSampleData: ingestSampleData,
           profileSample: profileSample,
+          confidence: processPii ? confidence : undefined,
           profileSampleType: profileSampleType,
           threadCount: threadCount,
           timeoutSeconds: timeoutSeconds,

View File (ConfigureIngestion test mock, TypeScript)

@@ -103,6 +103,7 @@ const mockConfigureIngestion: ConfigureIngestionProps = {
     threadCount: 5,
     timeoutSeconds: 43200,
     useFqnFilter: false,
+    confidence: 80,
   } as unknown as AddIngestionState,
 };

View File (ConfigureIngestion component, TypeScript)

@@ -85,6 +85,7 @@ const ConfigureIngestion = ({
     topicFilterPattern,
     useFqnFilter,
     processPii,
+    confidence,
     overrideOwner,
   } = useMemo(
     () => ({
@@ -124,6 +125,7 @@ const ConfigureIngestion = ({
       useFqnFilter: data.useFqnFilter,
       processPii: data.processPii,
       overrideOwner: data.overrideOwner,
+      confidence: data.confidence,
     }),
     [data]
   );
@@ -150,6 +152,11 @@ const ConfigureIngestion = ({
       profileSample: profileSample ?? undefined,
     });
 
+  const handleConfidenceScore = (confidence: number | undefined | null) =>
+    onChange({
+      confidence: confidence ?? undefined,
+    });
+
   const handleProfileSampleTypeChange = (value: ProfileSampleType) => {
     onChange({
       profileSampleType: value,
@@ -481,20 +488,33 @@ const ConfigureIngestion = ({
   const getProcessPiiTogglesForProfiler = () => {
     return (
-      <Field>
-        <div className="tw-flex tw-gap-1">
-          <label>{t('label.auto-tag-pii-uppercase')}</label>
-          <ToggleSwitchV1
-            checked={processPii}
-            handleCheck={handleProcessPii}
-            testId="include-lineage"
-          />
-        </div>
-        <p className="tw-text-grey-muted tw-mt-3">
-          {t('message.process-pii-sensitive-column-message-profiler')}
-        </p>
-        {getSeparator('')}
-      </Field>
+      <Fragment>
+        <Field>
+          <div className="tw-flex tw-gap-1">
+            <label>{t('label.auto-tag-pii-uppercase')}</label>
+            <ToggleSwitchV1
+              checked={processPii}
+              handleCheck={handleProcessPii}
+              testId="include-lineage"
+            />
+          </div>
+          <p className="tw-text-grey-muted tw-mt-3">
+            {t('message.process-pii-sensitive-column-message-profiler')}
+          </p>
+          {processPii && (
+            <>
+              {getSeparator('')}
+              <Typography.Paragraph className="text-grey-muted m-t-0 m-b-xs text-sm">
+                {t('message.confidence-percentage-message')}
+              </Typography.Paragraph>
+              <SliderWithInput
+                value={confidence || 80}
+                onChange={handleConfidenceScore}
+              />
+            </>
+          )}
+        </Field>
+      </Fragment>
     );
   };

View File (AddIngestionState interface, TypeScript)

@@ -135,6 +135,7 @@ export interface AddIngestionState {
   useFqnFilter: boolean;
   processPii: boolean;
   overrideOwner: boolean;
+  confidence?: number;
 }
 
 export enum ShowFilter {

View File (English locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "closed this task",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Set the confidence level for the NLP model to use when infering whether a column contains PII data or not.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File (French locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "closed this task",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Entrer le niveau de confiance à utiliser pour le modèle NLP lorsque celui-ci détermine si un champs contain des données IIP.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File (Japanese locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "このタスクを閉じました",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Set the confidence level for the NLP model to use when infering whether a column contains PII data or not.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",

View File (additional locale strings)

@@ -910,6 +910,7 @@
   "click-text-to-view-details": "Click <0>{{text}}</0> to view details.",
   "closed-this-task": "closed this task",
   "collaborate-with-other-user": "to collaborate with other users.",
+  "confidence-percentage-message": "Set the confidence level for the NLP model to use when infering whether a column contains PII data or not.",
   "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.",
   "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our",
   "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",