diff --git a/ingestion/src/metadata/ingestion/processor/pii.py b/ingestion/src/metadata/ingestion/processor/pii.py index 657d2ef73b6..88a91b3b4a8 100644 --- a/ingestion/src/metadata/ingestion/processor/pii.py +++ b/ingestion/src/metadata/ingestion/processor/pii.py @@ -13,8 +13,9 @@ Processor util to fetch pii sensitive columns """ import logging +import re import traceback -from enum import Enum +from enum import Enum, auto from typing import Optional, Tuple from metadata.generated.schema.entity.classification.tag import Tag @@ -25,11 +26,73 @@ from metadata.utils import fqn PII = "PII" +class PiiTypes(Enum): + """PiiTypes enumerates the different types of PII data""" + + NONE = auto() + UNSUPPORTED = auto() + PHONE = auto() + EMAIL = auto() + CREDIT_CARD = auto() + ADDRESS = auto() + ADDRESS_LOCATION = auto() + PERSON = auto() + LOCATION = auto() + BIRTH_DATE = auto() + GENDER = auto() + NATIONALITY = auto() + IP_ADDRESS = auto() + SSN = auto() + USER_NAME = auto() + PASSWORD = auto() + ETHNICITY = auto() + TAX_ID = auto() + KEY = auto() + BANKACC = auto() + + class TagType(Enum): SENSITIVE = "Sensitive" NONSENSITIVE = "NonSensitive" +class ColumnNameScanner: + """ + Column Name Scanner to scan column name + """ + + sensitive_regex = { + PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE), + PiiTypes.USER_NAME: re.compile("^.*user(id|name|).*$", re.IGNORECASE), + PiiTypes.KEY: re.compile("^.*(key).*$", re.IGNORECASE), + PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE), + PiiTypes.CREDIT_CARD: re.compile("^.*(card).*$", re.IGNORECASE), + PiiTypes.BANKACC: re.compile("^.*(bank|acc|amount).*$", re.IGNORECASE), + PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE), + } + non_sensitive_regex = { + PiiTypes.PERSON: re.compile( + "^.*(firstname|fname|lastname|lname|" + "fullname|maidenname|_name|" + "nickname|name_suffix|name).*$", + re.IGNORECASE, + ), + PiiTypes.BIRTH_DATE: re.compile( + "^.*(date_of_birth|dateofbirth|dob|" + 
"birthday|date_of_death|dateofdeath).*$", + re.IGNORECASE, + ), + PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE), + PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE), + PiiTypes.ADDRESS: re.compile( + "^.*(address|city|state|county|country|" + "zipcode|zip|postal|zone|borough).*$", + re.IGNORECASE, + ), + PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE), + } + + class NEREntity(Enum): CREDIT_CARD = TagType.SENSITIVE.value EMAIL_ADDRESS = TagType.SENSITIVE.value @@ -42,7 +105,7 @@ class NEREntity(Enum): US_DRIVER_LICENSE = TagType.SENSITIVE.value DATE_TIME = TagType.NONSENSITIVE.value URL = TagType.SENSITIVE.value - US_BANK_NUMBER = TagType.NONSENSITIVE.value + US_BANK_NUMBER = TagType.SENSITIVE.value US_SSN = TagType.SENSITIVE.value PERSON = TagType.SENSITIVE.value US_PASSPORT = TagType.SENSITIVE.value @@ -78,11 +141,21 @@ class NERScanner: most_used_label_occurrence = score[1] return label_score or (None, None) + def column_name_scan(self, column_name: str): + for pii_type_pattern in ColumnNameScanner.sensitive_regex.values(): + if pii_type_pattern.match(column_name) is not None: + return TagType.SENSITIVE.value, 1 + + for pii_type_pattern in ColumnNameScanner.non_sensitive_regex.values(): + if pii_type_pattern.match(column_name) is not None: + return TagType.NONSENSITIVE.value, 1 + + return None + def scan(self, text) -> Tuple[str, float]: """Scan the text and return an pii tag fqn and confidence/score""" logging.debug("Processing '%s'", text) - pii_tag_fqn = "" labels_score = {} self.text = [str(row) for row in text if row is not None] for row in self.text: @@ -106,19 +179,23 @@ class NERScanner: label, score = self.get_highest_score_label(labels_score) if label and score: - label_type = NEREntity.__members__.get( + tag_type = NEREntity.__members__.get( label, TagType.NONSENSITIVE.value ).value - pii_tag_fqn = fqn.build( - self.metadata, - entity_type=Tag, - classification_name=PII, - tag_name=label_type, - ) + return 
tag_type, score - return pii_tag_fqn or "", score or 0 + return "", 0 - def process(self, table_data: TableData, table_entity: Table, client: OpenMetadata): + def process( + self, + table_data: TableData, + table_entity: Table, + client: OpenMetadata, + thresold_confidence: float, + ): + """ + process function to start processing sample data + """ len_of_rows = len(table_data.rows[0]) if table_data.rows else 0 for idx in range(len_of_rows): pii_found = False @@ -128,11 +205,19 @@ class NERScanner: continue if pii_found is True: continue - pii_tag_fqn, confidence = self.scan([row[idx] for row in table_data.rows]) - if pii_tag_fqn and confidence >= 0.8: + tag_type, confidence = self.column_name_scan( + table_data.columns[idx].__root__ + ) or self.scan([row[idx] for row in table_data.rows]) + if tag_type and confidence >= thresold_confidence / 100: + tag_fqn = fqn.build( + self.metadata, + entity_type=Tag, + classification_name=PII, + tag_name=tag_type, + ) client.patch_column_tag( entity_id=table_entity.id, column_name=table_entity.columns[idx].name.__root__, - tag_fqn=pii_tag_fqn, + tag_fqn=tag_fqn, is_suggested=True, ) diff --git a/ingestion/src/metadata/profiler/profiler/core.py b/ingestion/src/metadata/profiler/profiler/core.py index 75ff65aca37..33ced5075d5 100644 --- a/ingestion/src/metadata/profiler/profiler/core.py +++ b/ingestion/src/metadata/profiler/profiler/core.py @@ -508,6 +508,7 @@ class Profiler(Generic[TMetric]): sample_data, self.profiler_interface.table_entity, # type: ignore self.profiler_interface.ometa_client, # type: ignore + self.profiler_interface.source_config.confidence, ) except Exception as exc: logger.warning( diff --git a/ingestion/tests/integration/utils/test_processor.py b/ingestion/tests/integration/utils/test_processor.py index 041cac1f974..2e1b9851302 100644 --- a/ingestion/tests/integration/utils/test_processor.py +++ b/ingestion/tests/integration/utils/test_processor.py @@ -685,12 +685,10 @@ UPDATED_TABLE_ENTITY = [ 
fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.first_name", tags=[ TagLabel( - tagFQN=TagFQN(__root__="PII.Sensitive"), + tagFQN=TagFQN(__root__="PII.NonSensitive"), description=( - ( - "PII which if lost, compromised, or disclosed without authorization, could result in " - "substantial harm, embarrassment, inconvenience, or unfairness to an individual." - ) + "PII which is easily accessible from public sources and can include zip code, " + "race, gender, and date of birth." ), source="Classification", labelType="Automated", @@ -716,7 +714,19 @@ UPDATED_TABLE_ENTITY = [ dataTypeDisplay="varchar", description=None, fullyQualifiedName="test-service-table-patch.test-db.test-schema.customers.last_name", - tags=[], + tags=[ + TagLabel( + tagFQN=TagFQN(__root__="PII.NonSensitive"), + description=( + "PII which is easily accessible from public sources and can include zip code, " + "race, gender, and date of birth." + ), + source="Classification", + labelType="Automated", + state="Suggested", + href=None, + ) + ], constraint=None, ordinalPosition=None, jsonSchema=None, @@ -884,7 +894,10 @@ class PiiProcessorTest(TestCase): TABLE_ENTITY.id = table_entity.id self.nerscanner_processor.process( - table_data=table_data, table_entity=TABLE_ENTITY, client=self.metadata + table_data=table_data, + table_entity=TABLE_ENTITY, + client=self.metadata, + thresold_confidence=85, ) updated_table_entity = self.metadata.get_by_id( entity=Table, entity_id=table_entity.id, fields=["tags"] diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json index b8e6d3f4081..00f798f7a9d 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json @@ 
-36,6 +36,11 @@ "type": "boolean", "default": false }, + "confidence": { + "description": "Confidence threshold, as a percentage (0-100), that a PII detection must reach for the column to be tagged.", + "type": "number", + "default": 80 + }, "generateSampleData": { "description": "Option to turn on/off generating sample data.", "type": "boolean", diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx index 85a859d8c76..e112c31088a 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx @@ -194,6 +194,7 @@ const AddIngestion = ({ resultLimit: sourceConfig?.resultLimit ?? 1000, metadataToESConfig: undefined, dbtUpdateDescriptions: sourceConfig?.dbtUpdateDescriptions ?? false, + confidence: sourceConfig?.confidence, dbtClassificationName: sourceConfig?.dbtClassificationName ?? DBT_CLASSIFICATION_DEFAULT_VALUE, // default value from Json Schema }), @@ -425,6 +426,7 @@ const AddIngestion = ({ threadCount, timeoutSeconds, processPii, + confidence, } = state; switch (type) { case PipelineType.Usage: { @@ -460,6 +462,7 @@ const AddIngestion = ({ type: profilerIngestionType, generateSampleData: ingestSampleData, profileSample: profileSample, + confidence: processPii ? 
confidence : undefined, profileSampleType: profileSampleType, threadCount: threadCount, timeoutSeconds: timeoutSeconds, diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx index ebbba976849..9f1f87b76dd 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx @@ -103,6 +103,7 @@ const mockConfigureIngestion: ConfigureIngestionProps = { threadCount: 5, timeoutSeconds: 43200, useFqnFilter: false, + confidence: 80, } as unknown as AddIngestionState, }; diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx index 3162e259434..3bb6fc7d858 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx @@ -85,6 +85,7 @@ const ConfigureIngestion = ({ topicFilterPattern, useFqnFilter, processPii, + confidence, overrideOwner, } = useMemo( () => ({ @@ -124,6 +125,7 @@ const ConfigureIngestion = ({ useFqnFilter: data.useFqnFilter, processPii: data.processPii, overrideOwner: data.overrideOwner, + confidence: data.confidence, }), [data] ); @@ -150,6 +152,11 @@ const ConfigureIngestion = ({ profileSample: profileSample ?? undefined, }); + const handleConfidenceScore = (confidence: number | undefined | null) => + onChange({ + confidence: confidence ?? 
undefined, + }); + const handleProfileSampleTypeChange = (value: ProfileSampleType) => { onChange({ profileSampleType: value, @@ -481,20 +488,33 @@ const ConfigureIngestion = ({ const getProcessPiiTogglesForProfiler = () => { return ( - -
- - -
-

- {t('message.process-pii-sensitive-column-message-profiler')} -

- {getSeparator('')} -
+ + +
+ + +
+

+ {t('message.process-pii-sensitive-column-message-profiler')} +

+ {processPii && ( + <> + {getSeparator('')} + + {t('message.confidence-percentage-message')} + + + + )} +
+
); }; diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts index 1717f012bf5..98b9aec8160 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts @@ -135,6 +135,7 @@ export interface AddIngestionState { useFqnFilter: boolean; processPii: boolean; overrideOwner: boolean; + confidence?: number; } export enum ShowFilter { diff --git a/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json b/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json index 24e767a3324..3185c1ecadd 100644 --- a/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json +++ b/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json @@ -910,6 +910,7 @@ "click-text-to-view-details": "Click <0>{{text}} to view details.", "closed-this-task": "closed this task", "collaborate-with-other-user": "to collaborate with other users.", + "confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.", "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.", "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our", "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. 
Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.", diff --git a/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json b/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json index 604519d6273..e7969e08ef4 100644 --- a/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json +++ b/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json @@ -910,6 +910,7 @@ "click-text-to-view-details": "Click <0>{{text}} to view details.", "closed-this-task": "closed this task", "collaborate-with-other-user": "to collaborate with other users.", + "confidence-percentage-message": "Entrer le niveau de confiance à utiliser pour le modèle NLP lorsque celui-ci détermine si un champ contient des données IIP.", "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.", "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our", "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. 
Integrate with dbt from OpenMetadata to view the models used to generate tables.", diff --git a/openmetadata-ui/src/main/resources/ui/src/locale/languages/ja-jp.json b/openmetadata-ui/src/main/resources/ui/src/locale/languages/ja-jp.json index 527841f383c..3d70f58cd12 100644 --- a/openmetadata-ui/src/main/resources/ui/src/locale/languages/ja-jp.json +++ b/openmetadata-ui/src/main/resources/ui/src/locale/languages/ja-jp.json @@ -910,6 +910,7 @@ "click-text-to-view-details": "Click <0>{{text}} to view details.", "closed-this-task": "このタスクを閉じました", "collaborate-with-other-user": "to collaborate with other users.", + "confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.", "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.", "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our", "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. 
Integrate with dbt from OpenMetadata to view the models used to generate tables.", diff --git a/openmetadata-ui/src/main/resources/ui/src/locale/languages/zh-cn.json b/openmetadata-ui/src/main/resources/ui/src/locale/languages/zh-cn.json index 34621f884cf..65862331ba7 100644 --- a/openmetadata-ui/src/main/resources/ui/src/locale/languages/zh-cn.json +++ b/openmetadata-ui/src/main/resources/ui/src/locale/languages/zh-cn.json @@ -910,6 +910,7 @@ "click-text-to-view-details": "Click <0>{{text}} to view details.", "closed-this-task": "closed this task", "collaborate-with-other-user": "to collaborate with other users.", + "confidence-percentage-message": "Set the confidence level for the NLP model to use when inferring whether a column contains PII data or not.", "configure-a-service-description": "Enter a unique service name. The name must be unique across the category of services. For e.g., among database services, both MySQL and Snowflake cannot have the same service name (E.g. customer_data). However, different service categories (dashboard, pipeline) can have the same service name. Spaces are not supported in the service name. Characters like - _ are supported. Also, add a description.", "configure-airflow": "To set up metadata extraction through UI, you first need to configure and connect to Airflow. For more details visit our", "configure-dbt-model-description": "A dbt model provides transformation logic that creates a table from raw data. Lineage traces the path of data across tables, but a dbt model provides specifics. Select the required dbt source provider and fill in the mandatory fields. Integrate with dbt from OpenMetadata to view the models used to generate tables.",