From 34a0cc147e185f8cc1a18fd8ec77c0964ef506d9 Mon Sep 17 00:00:00 2001 From: NiharDoshi99 <51595473+NiharDoshi99@users.noreply.github.com> Date: Wed, 8 Feb 2023 21:30:47 +0530 Subject: [PATCH] Fix: Added changes for Pii sensitive (#10119) * Fix: added changes for pii sensitive * Fix: removed comments * Fix: python checkstyle * differtiate between sensitive and non sensitive tag * fix: python test * fix: added tests * fix: maven CI --- .../source/database/common_db_source.py | 7 + .../source/database/database_service.py | 8 + .../source/database/datalake/metadata.py | 3 + .../source/database/deltalake/metadata.py | 4 +- .../source/database/domodatabase/metadata.py | 3 + .../source/database/dynamodb/metadata.py | 4 +- .../source/database/glue/metadata.py | 3 + .../ingestion/source/database/processor.py | 197 +++++++++ .../source/database/salesforce/metadata.py | 3 + .../tests/integration/utils/test_processor.py | 397 ++++++++++++++++++ .../databaseServiceMetadataPipeline.json | 5 + .../AddIngestion/AddIngestion.component.tsx | 3 + .../Steps/ConfigureIngestion.test.tsx | 2 +- .../AddIngestion/Steps/ConfigureIngestion.tsx | 24 ++ .../AddIngestion/addIngestion.interface.ts | 1 + .../ui/src/locale/languages/en-us.json | 2 + 16 files changed, 663 insertions(+), 3 deletions(-) create mode 100644 ingestion/src/metadata/ingestion/source/database/processor.py create mode 100644 ingestion/tests/integration/utils/test_processor.py diff --git a/ingestion/src/metadata/ingestion/source/database/common_db_source.py b/ingestion/src/metadata/ingestion/source/database/common_db_source.py index c1a99385bc5..eba2e8d900b 100644 --- a/ingestion/src/metadata/ingestion/source/database/common_db_source.py +++ b/ingestion/src/metadata/ingestion/source/database/common_db_source.py @@ -52,6 +52,7 @@ from metadata.ingestion.source.database.database_service import ( DatabaseServiceSource, SQLSourceStatus, ) +from metadata.ingestion.source.database.processor import PiiProcessor from metadata.ingestion.source.database.sql_column_handler import SqlColumnHandlerMixin from metadata.ingestion.source.database.sqlalchemy_source import SqlAlchemySource from metadata.ingestion.source.models import TableView @@ -372,6 +373,12 @@ class CommonDbSourceService( table_name=table_name ), # Pick tags from context info, if any ) + + # Process pii sensitive column and append tags + if self.source_config.processPiiSensitive: + processor = PiiProcessor(metadata_config=self.metadata) + processor.process(table_request) + is_partitioned, partition_details = self.get_table_partition_details( table_name=table_name, schema_name=schema_name, inspector=self.inspector ) diff --git a/ingestion/src/metadata/ingestion/source/database/database_service.py b/ingestion/src/metadata/ingestion/source/database/database_service.py index 9b1a036b537..fa7c4129aec 100644 --- a/ingestion/src/metadata/ingestion/source/database/database_service.py +++ b/ingestion/src/metadata/ingestion/source/database/database_service.py @@ -67,6 +67,7 @@ from metadata.ingestion.models.topology import ( create_source_context, ) from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.ingestion.source.database.processor import PiiProcessor from metadata.utils import fqn from metadata.utils.filters import filter_by_schema from metadata.utils.logger import ingestion_logger @@ -490,3 +491,10 @@ class DatabaseServiceSource( ) yield from self.delete_schema_tables(schema_fqn) + + def process_pii_sensitive_column( + self, metadata_config: OpenMetadata, table_request: CreateTableRequest + ): + if self.source_config.processPiiSensitive: + processer = PiiProcessor(metadata_config=metadata_config) + processer.process(table_request=table_request) diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py b/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py index e67aa0fb3d7..7299aa56e7d 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py @@ -432,6 +432,9 @@ class DatalakeSource(DatabaseServiceSource): # pylint: disable=too-many-public- type="databaseSchema", ), ) + self.process_pii_sensitive_column( + metadata_config=self.metadata, table_request=table_request + ) yield table_request self.register_record(table_request=table_request) except Exception as exc: diff --git a/ingestion/src/metadata/ingestion/source/database/deltalake/metadata.py b/ingestion/src/metadata/ingestion/source/database/deltalake/metadata.py index f9a7a57bb3c..bca1ab8c023 100644 --- a/ingestion/src/metadata/ingestion/source/database/deltalake/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/deltalake/metadata.py @@ -272,7 +272,9 @@ class DeltalakeSource(DatabaseServiceSource): ), viewDefinition=view_definition, ) - + self.process_pii_sensitive_column( + metadata_config=self.metadata, table_request=table_request + ) yield table_request self.register_record(table_request=table_request) diff --git a/ingestion/src/metadata/ingestion/source/database/domodatabase/metadata.py b/ingestion/src/metadata/ingestion/source/database/domodatabase/metadata.py index 091cdd30c33..f226fe767d9 100644 --- a/ingestion/src/metadata/ingestion/source/database/domodatabase/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/domodatabase/metadata.py @@ -161,6 +161,9 @@ class DomodatabaseSource(DatabaseServiceSource): type="databaseSchema", ), ) + self.process_pii_sensitive_column( + metadata_config=self.metadata, table_request=table_request + ) yield table_request self.register_record(table_request=table_request) except Exception as exc: diff --git a/ingestion/src/metadata/ingestion/source/database/dynamodb/metadata.py b/ingestion/src/metadata/ingestion/source/database/dynamodb/metadata.py index 04a8213912d..7f810596a98 100644 --- a/ingestion/src/metadata/ingestion/source/database/dynamodb/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/dynamodb/metadata.py @@ -205,7 +205,9 @@ class DynamodbSource(DatabaseServiceSource): type="databaseSchema", ), ) - + self.process_pii_sensitive_column( + metadata_config=self.metadata, table_request=table_request + ) yield table_request self.register_record(table_request=table_request) diff --git a/ingestion/src/metadata/ingestion/source/database/glue/metadata.py b/ingestion/src/metadata/ingestion/source/database/glue/metadata.py index cc231400ded..6d49aaf057f 100755 --- a/ingestion/src/metadata/ingestion/source/database/glue/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/glue/metadata.py @@ -287,6 +287,9 @@ class GlueSource(DatabaseServiceSource): type="databaseSchema", ), ) + self.process_pii_sensitive_column( + metadata_config=self.metadata, table_request=table_request + ) yield table_request self.register_record(table_request=table_request) except Exception as exc: diff --git a/ingestion/src/metadata/ingestion/source/database/processor.py b/ingestion/src/metadata/ingestion/source/database/processor.py new file mode 100644 index 00000000000..671df1cf69b --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/processor.py @@ -0,0 +1,197 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor util to fetch pii sensitive columns +""" +import logging +import re +from abc import ABC, abstractmethod +from enum import Enum, auto +from typing import List, Optional + +from commonregex import CommonRegex +from pydantic import BaseModel + +from metadata.generated.schema.api.data.createTable import CreateTableRequest +from metadata.generated.schema.type.tagLabel import ( + LabelType, + State, + TagLabel, + TagSource, +) +from metadata.ingestion.api.processor import Processor, ProcessorStatus +from metadata.ingestion.ometa.ometa_api import OpenMetadata, OpenMetadataConnection + +PII = "PII" + + +class PiiTypes(Enum): + """PiiTypes enumerates the different types of PII data""" + + NONE = auto() + UNSUPPORTED = auto() + PHONE = auto() + EMAIL = auto() + CREDIT_CARD = auto() + ADDRESS = auto() + ADDRESS_LOCATION = auto() + PERSON = auto() + LOCATION = auto() + BIRTH_DATE = auto() + GENDER = auto() + NATIONALITY = auto() + IP_ADDRESS = auto() + SSN = auto() + USER_NAME = auto() + PASSWORD = auto() + ETHNICITY = auto() + TAX_ID = auto() + KEY = auto() + BANKACC = auto() + + +class TagType(Enum): + SENSITIVE = "Sensitive" + NONSENSITIVE = "NonSensitive" + + +class ColumnPIIType(BaseModel): + pii_types: PiiTypes + tag_type: TagType + + +class Scanner(ABC): + @abstractmethod + def scan(self, text): + """scan the text and return array of PiiTypes that are found""" + + +class RegexScanner(Scanner): + """A scanner that uses commmon regular expressions to find PII""" + + def scan(self, text): + """Scan the text and return an array of PiiTypes that are found""" + regex_result = CommonRegex(text) + types = [] + if regex_result.phones: # pylint: disable=no-member + types.append(PiiTypes.PHONE.name) + if regex_result.emails: # pylint: disable=no-member + types.append(PiiTypes.EMAIL.name) + if regex_result.credit_cards: # pylint: disable=no-member + types.append(PiiTypes.CREDIT_CARD.name) + if regex_result.street_addresses: # pylint: disable=no-member + types.append(PiiTypes.ADDRESS.name) + + return types + + +class ColumnNameScanner(Scanner): + """ + Column Name Scanner to scan column name + """ + + sensitive_regex = { + PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE), + PiiTypes.USER_NAME: re.compile("^.*user(id|name|).*$", re.IGNORECASE), + PiiTypes.KEY: re.compile("^.*(key).*$", re.IGNORECASE), + PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE), + PiiTypes.CREDIT_CARD: re.compile("^.*(card).*$", re.IGNORECASE), + PiiTypes.BANKACC: re.compile("^.*(bank|acc|amount).*$", re.IGNORECASE), + PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE), + } + non_sensitive_regex = { + PiiTypes.PERSON: re.compile( + "^.*(firstname|fname|lastname|lname|" + "fullname|maidenname|_name|" + "nickname|name_suffix|name).*$", + re.IGNORECASE, + ), + PiiTypes.BIRTH_DATE: re.compile( + "^.*(date_of_birth|dateofbirth|dob|" + "birthday|date_of_death|dateofdeath).*$", + re.IGNORECASE, + ), + PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE), + PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE), + PiiTypes.ADDRESS: re.compile( + "^.*(address|city|state|county|country|" + "zipcode|zip|postal|zone|borough).*$", + re.IGNORECASE, + ), + PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE), + } + + def scan(self, text) -> Optional[List[ColumnPIIType]]: + types = set() + for pii_type_keys, pii_type_pattern in self.sensitive_regex.items(): + if pii_type_pattern.match(text) is not None: + return ColumnPIIType( + pii_types=pii_type_keys, tag_type=TagType.SENSITIVE.value + ) + + for pii_type_keys, pii_type_pattern in self.non_sensitive_regex.items(): + if pii_type_pattern.match(text) is not None: + return ColumnPIIType( + pii_types=pii_type_keys, tag_type=TagType.NONSENSITIVE.value + ) + + logging.debug("PiiTypes are %s", ",".join(str(x) for x in list(types))) + return None + + +class PiiProcessor(Processor): + """ + Processor class to process columns of table + """ + + metadata_config: OpenMetadata + status: ProcessorStatus + metadata: OpenMetadata + + def __init__(self, metadata_config: OpenMetadata): + super().__init__() + self.metadata = metadata_config + self.status = ProcessorStatus() + self.column_scanner = ColumnNameScanner() + + @classmethod + def create(cls, config_dict: dict): # pylint: disable=arguments-differ + metadata_config = OpenMetadataConnection.parse_obj(config_dict) + return cls(metadata_config) + + def process( # pylint: disable=arguments-differ + self, table_request: CreateTableRequest + ) -> Optional[CreateTableRequest]: + for column in table_request.columns: + pii_tags = [] + pii_tags: ColumnPIIType = self.column_scanner.scan(column.name.__root__) + tag_labels = [] + if pii_tags: + tag_labels.append( + TagLabel( + tagFQN=f"{PII}.{pii_tags.tag_type.value}", + labelType=LabelType.Automated.value, + state=State.Suggested.value, + source=TagSource.Tag.value, + ) + ) + if len(tag_labels) > 0 and column.tags: + column.tags.extend(tag_labels) + elif len(tag_labels) > 0: + column.tags = tag_labels + self.status.records.append(column.name.__root__) + + def close(self): + pass + + def get_status(self) -> ProcessorStatus: + return self.status diff --git a/ingestion/src/metadata/ingestion/source/database/salesforce/metadata.py b/ingestion/src/metadata/ingestion/source/database/salesforce/metadata.py index 0cc7384a298..1bf538235fd 100644 --- a/ingestion/src/metadata/ingestion/source/database/salesforce/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/salesforce/metadata.py @@ -206,6 +206,9 @@ class SalesforceSource(DatabaseServiceSource): type="databaseSchema", ), ) + self.process_pii_sensitive_column( + metadata_config=self.metadata, table_request=table_request + ) yield table_request self.register_record(table_request=table_request) diff --git a/ingestion/tests/integration/utils/test_processor.py b/ingestion/tests/integration/utils/test_processor.py new file mode 100644 index 00000000000..e37d813e053 --- /dev/null +++ b/ingestion/tests/integration/utils/test_processor.py @@ -0,0 +1,397 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test Processor Class +""" + +from unittest import TestCase + +from metadata.generated.schema.api.data.createTable import CreateTableRequest +from metadata.generated.schema.entity.data.table import Column, DataType, TableType +from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( + OpenMetadataConnection, +) +from metadata.generated.schema.security.client.openMetadataJWTClientConfig import ( + OpenMetadataJWTClientConfig, +) +from metadata.generated.schema.type.entityReference import EntityReference +from metadata.generated.schema.type.tagLabel import TagFQN, TagLabel +from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.ingestion.source.database.processor import PiiProcessor + +MOCK_TABLE: CreateTableRequest = CreateTableRequest( + name="DataSet Input", + displayName="DataSet Input", + description="this is a description for dataset input", + tableType=TableType.Regular.value, + columns=[ + Column( + name="amount", + displayName=None, + dataType=DataType.DOUBLE.value, + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="This is description for amount", + fullyQualifiedName=None, + tags=None, + constraint=None, + ordinalPosition=1, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="bank_transfer_amount", + displayName=None, + dataType=DataType.DOUBLE.value, + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=None, + constraint=None, + ordinalPosition=2, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="coupon_amount", + displayName=None, + dataType=DataType.DOUBLE.value, + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=None, + constraint=None, + ordinalPosition=3, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="credit_card_amount", + displayName=None, + dataType=DataType.DOUBLE.value, + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN="PersonalData.Personal", + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ) + ], + constraint=None, + ordinalPosition=4, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="FirstName", + displayName=None, + dataType=DataType.STRING.value, + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=None, + constraint=None, + ordinalPosition=4, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="is_customer", + displayName=None, + dataType=DataType.BOOLEAN.value, + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN="PersonalData.Personal", + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ) + ], + constraint=None, + ordinalPosition=4, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + ], + tableConstraints=None, + tablePartition=None, + tableProfilerConfig=None, + owner=None, + databaseSchema=EntityReference( + id="c3eb265f-5445-4ad3-ba5e-797d3a3071bb", + type="databaseSchema", + name=None, + fullyQualifiedName=None, + description=None, + displayName=None, + deleted=None, + href=None, + ), + tags=None, + viewDefinition=None, + extension=None, +) + +EXPECTED_COLUMNS = [ + Column( + name="amount", + displayName=None, + dataType="DOUBLE", + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="This is description for amount", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN=TagFQN(__root__="PII.Sensitive"), + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ) + ], + constraint=None, + ordinalPosition=1, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="bank_transfer_amount", + displayName=None, + dataType="DOUBLE", + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN=TagFQN(__root__="PII.Sensitive"), + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ) + ], + constraint=None, + ordinalPosition=2, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="coupon_amount", + displayName=None, + dataType="DOUBLE", + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN=TagFQN(__root__="PII.Sensitive"), + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ) + ], + constraint=None, + ordinalPosition=3, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="credit_card_amount", + displayName=None, + dataType="DOUBLE", + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN="PersonalData.Personal", + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ), + TagLabel( + tagFQN=TagFQN(__root__="PII.Sensitive"), + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ), + ], + constraint=None, + ordinalPosition=4, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="FirstName", + displayName=None, + dataType="STRING", + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN=TagFQN(__root__="PII.NonSensitive"), + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ) + ], + constraint=None, + ordinalPosition=4, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), + Column( + name="is_customer", + displayName=None, + dataType="BOOLEAN", + arrayDataType=None, + dataLength=None, + precision=None, + scale=None, + dataTypeDisplay=None, + description="", + fullyQualifiedName=None, + tags=[ + TagLabel( + tagFQN="PersonalData.Personal", + description=None, + source="Tag", + labelType="Automated", + state="Suggested", + href=None, + ) + ], + constraint=None, + ordinalPosition=4, + jsonSchema=None, + children=None, + customMetrics=None, + profile=None, + ), +] + + +class PiiProcessorTest(TestCase): + """ + Run this integration test with different type of column name + to attach PII Tags + """ + + def __init__( + self, + methodName, + ) -> None: + super().__init__(methodName) + server_config = OpenMetadataConnection( + hostPort="http://localhost:8585/api", + authProvider="openmetadata", + securityConfig=OpenMetadataJWTClientConfig( + jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJh" + "bGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vc" + "mciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7Hgz" + "GBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUx" + "huv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakL" + "Lzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM" + "5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" + ), + ) + metadata = OpenMetadata(server_config) + self.processor = PiiProcessor(metadata_config=metadata) + + def test_process(self): + self.processor.process(MOCK_TABLE) + assert MOCK_TABLE.columns == EXPECTED_COLUMNS diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceMetadataPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceMetadataPipeline.json index 090c292d65c..e4a454a5a8b 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceMetadataPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceMetadataPipeline.json @@ -43,6 +43,11 @@ "type": "boolean", "default": true }, + "processPiiSensitive": { + "description": "Optional configuration to automatically tag columns that might contain sensitive information", + "type": "boolean", + "default": true + }, "useFqnForFiltering": { "description": "Regex will be applied on fully qualified name (e.g service_name.db_name.schema_name.table_name) instead of raw name (e.g. table_name)", "type": "boolean", diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx index 7054ca65537..99b1cbba02b 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/AddIngestion.component.tsx @@ -141,6 +141,7 @@ const AddIngestion = ({ data?.name ?? getIngestionName(serviceData.name, pipelineType), ingestSampleData: sourceConfig?.generateSampleData ?? true, useFqnFilter: sourceConfig?.useFqnForFiltering ?? false, + processPii: sourceConfig?.processPiiSensitive ?? false, databaseServiceNames: sourceConfig?.dbServiceNames ?? [], description: data?.description ?? '', repeatFrequency: @@ -327,6 +328,7 @@ const AddIngestion = ({ tableFilterPattern, topicFilterPattern, useFqnFilter, + processPii, } = state; switch (serviceCategory) { @@ -335,6 +337,7 @@ const AddIngestion = ({ useFqnForFiltering: useFqnFilter, includeViews: includeView, includeTags: includeTags, + processPiiSensitive: processPii, databaseFilterPattern: getFilterPatternData( databaseFilterPattern, showDatabaseFilter diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx index ebbba976849..5e8e9499691 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.test.tsx @@ -132,6 +132,6 @@ describe('Test ConfigureIngestion component', () => { expect(backButton).toBeInTheDocument(); expect(nextButton).toBeInTheDocument(); expect(filterPatternComponents).toHaveLength(3); - expect(toggleSwitchs).toHaveLength(5); + expect(toggleSwitchs).toHaveLength(6); }); }); diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx index dac23075a58..24a31cfd64f 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/Steps/ConfigureIngestion.tsx @@ -85,6 +85,7 @@ const ConfigureIngestion = ({ timeoutSeconds, topicFilterPattern, useFqnFilter, + processPii, } = useMemo( () => ({ chartFilterPattern: data.chartFilterPattern, @@ -121,6 +122,7 @@ const ConfigureIngestion = ({ timeoutSeconds: data.timeoutSeconds, topicFilterPattern: data.topicFilterPattern, useFqnFilter: data.useFqnFilter, + processPii: data.processPii, }), [data] ); @@ -185,6 +187,8 @@ const ConfigureIngestion = ({ const handleFqnFilter = () => toggleField('useFqnFilter'); + const handleProcessPii = () => toggleField('processPii'); + const handleQueryLogDuration = handleValueParseInt('queryLogDuration'); const handleResultLimit = handleValueParseInt('resultLimit'); @@ -455,6 +459,25 @@ const ConfigureIngestion = ({ ); }; + const getProcessPiiToggles = () => { + return ( + +
+ + +
+

+ {t('message.process-pii-sensitive-column-message')} +

+ {getSeparator('')} +
+ ); + }; + const getDashboardDBServiceName = () => { return ( @@ -527,6 +550,7 @@ const ConfigureIngestion = ({ {getFilterPatterns()} {getSeparator('')} {getFqnForFilteringToggles()} + {getProcessPiiToggles()} {getDatabaseFieldToggles()} ); diff --git a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts index 44c72120b68..aed1ee67177 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts +++ b/openmetadata-ui/src/main/resources/ui/src/components/AddIngestion/addIngestion.interface.ts @@ -133,6 +133,7 @@ export interface AddIngestionState { timeoutSeconds: number; topicFilterPattern: FilterPattern; useFqnFilter: boolean; + processPii: boolean; } export enum ShowFilter { diff --git a/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json b/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json index 67d318ca02e..fdc6aff9e94 100644 --- a/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json +++ b/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json @@ -501,6 +501,7 @@ "primary-key": "Primary Key", "private-key": "PrivateKey", "private-key-id": "Private Key ID", + "process-pii-sensitive-column": "Auto Tag PII", "profile": "Profile", "profile-lowercase": "profile", "profile-sample-type": "Profile Sample {{type}}", @@ -956,6 +957,7 @@ "pipeline-description-message": "Description of the pipeline.", "pipeline-trigger-success-message": "Pipeline triggered successfully!", "pipeline-will-trigger-manually": "Pipeline will only be triggered manually.", + "process-pii-sensitive-column-message": "Check column names to auto tag PII Senstive/nonSensitive columns.", "profile-sample-percentage-message": "Set the Profiler value as percentage", "profile-sample-row-count-message": " Set the Profiler value as row count", "profiler-ingestion-description": "A profiler workflow can be configured and deployed after a metadata ingestion has been set up. Multiple profiler pipelines can be set up for the same database service. The pipeline feeds the Profiler tab of the Table entity, and also runs the tests configured for that entity. Add a Name, FQN, and define the filter pattern to start.",