Fix: Added changes for Pii sensitive (#10119)

* Fix: added changes for pii sensitive

* Fix: removed comments

* Fix: python checkstyle

* differentiate between sensitive and non-sensitive tags

* fix: python test

* fix: added tests

* fix: maven CI
This commit is contained in:
NiharDoshi99 2023-02-08 21:30:47 +05:30 committed by GitHub
parent fe03e51cfe
commit 34a0cc147e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 663 additions and 3 deletions

View File

@ -52,6 +52,7 @@ from metadata.ingestion.source.database.database_service import (
DatabaseServiceSource,
SQLSourceStatus,
)
from metadata.ingestion.source.database.processor import PiiProcessor
from metadata.ingestion.source.database.sql_column_handler import SqlColumnHandlerMixin
from metadata.ingestion.source.database.sqlalchemy_source import SqlAlchemySource
from metadata.ingestion.source.models import TableView
@ -372,6 +373,12 @@ class CommonDbSourceService(
table_name=table_name
), # Pick tags from context info, if any
)
# Process pii sensitive column and append tags
if self.source_config.processPiiSensitive:
processor = PiiProcessor(metadata_config=self.metadata)
processor.process(table_request)
is_partitioned, partition_details = self.get_table_partition_details(
table_name=table_name, schema_name=schema_name, inspector=self.inspector
)

View File

@ -67,6 +67,7 @@ from metadata.ingestion.models.topology import (
create_source_context,
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.database.processor import PiiProcessor
from metadata.utils import fqn
from metadata.utils.filters import filter_by_schema
from metadata.utils.logger import ingestion_logger
@ -490,3 +491,10 @@ class DatabaseServiceSource(
)
yield from self.delete_schema_tables(schema_fqn)
def process_pii_sensitive_column(
    self, metadata_config: OpenMetadata, table_request: CreateTableRequest
):
    """
    Run PII column tagging on a table request when the source config asks for it.

    Nothing happens unless ``self.source_config.processPiiSensitive`` is
    truthy. Otherwise a ``PiiProcessor`` is built from ``metadata_config``
    and asked to process ``table_request``.
    """
    if not self.source_config.processPiiSensitive:
        return
    pii_processor = PiiProcessor(metadata_config=metadata_config)
    pii_processor.process(table_request=table_request)

View File

@ -432,6 +432,9 @@ class DatalakeSource(DatabaseServiceSource): # pylint: disable=too-many-public-
type="databaseSchema",
),
)
self.process_pii_sensitive_column(
metadata_config=self.metadata, table_request=table_request
)
yield table_request
self.register_record(table_request=table_request)
except Exception as exc:

View File

@ -272,7 +272,9 @@ class DeltalakeSource(DatabaseServiceSource):
),
viewDefinition=view_definition,
)
self.process_pii_sensitive_column(
metadata_config=self.metadata, table_request=table_request
)
yield table_request
self.register_record(table_request=table_request)

View File

@ -161,6 +161,9 @@ class DomodatabaseSource(DatabaseServiceSource):
type="databaseSchema",
),
)
self.process_pii_sensitive_column(
metadata_config=self.metadata, table_request=table_request
)
yield table_request
self.register_record(table_request=table_request)
except Exception as exc:

View File

@ -205,7 +205,9 @@ class DynamodbSource(DatabaseServiceSource):
type="databaseSchema",
),
)
self.process_pii_sensitive_column(
metadata_config=self.metadata, table_request=table_request
)
yield table_request
self.register_record(table_request=table_request)

View File

@ -287,6 +287,9 @@ class GlueSource(DatabaseServiceSource):
type="databaseSchema",
),
)
self.process_pii_sensitive_column(
metadata_config=self.metadata, table_request=table_request
)
yield table_request
self.register_record(table_request=table_request)
except Exception as exc:

View File

@ -0,0 +1,197 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor util to fetch pii sensitive columns
"""
import logging
import re
from abc import ABC, abstractmethod
from enum import Enum, auto
from typing import List, Optional
from commonregex import CommonRegex
from pydantic import BaseModel
from metadata.generated.schema.api.data.createTable import CreateTableRequest
from metadata.generated.schema.type.tagLabel import (
LabelType,
State,
TagLabel,
TagSource,
)
from metadata.ingestion.api.processor import Processor, ProcessorStatus
from metadata.ingestion.ometa.ometa_api import OpenMetadata, OpenMetadataConnection
PII = "PII"
class PiiTypes(Enum):
    """Closed set of PII categories a scanner can report."""

    # Values are spelled out; they are the same 1..20 sequence that
    # ``auto()`` would assign to the members in this order.
    NONE = 1
    UNSUPPORTED = 2
    PHONE = 3
    EMAIL = 4
    CREDIT_CARD = 5
    ADDRESS = 6
    ADDRESS_LOCATION = 7
    PERSON = 8
    LOCATION = 9
    BIRTH_DATE = 10
    GENDER = 11
    NATIONALITY = 12
    IP_ADDRESS = 13
    SSN = 14
    USER_NAME = 15
    PASSWORD = 16
    ETHNICITY = 17
    TAX_ID = 18
    KEY = 19
    BANKACC = 20
class TagType(Enum):
    """Sensitivity levels; each value is the tag-name suffix used for the label."""

    # Member values are embedded verbatim in the tag FQN, so keep the casing.
    SENSITIVE = "Sensitive"
    NONSENSITIVE = "NonSensitive"
class ColumnPIIType(BaseModel):
    """Outcome of scanning one column name: the PII category and its sensitivity."""

    # The single PII category that matched (despite the plural field name).
    pii_types: PiiTypes
    # Whether the match counts as sensitive or non-sensitive.
    tag_type: TagType
class Scanner(ABC):
    """Abstract base for PII scanners; concrete scanners implement ``scan``."""

    @abstractmethod
    def scan(self, text):
        """Inspect ``text`` and report the PII found in it."""
class RegexScanner(Scanner):
    """Scanner that relies on the ``commonregex`` patterns to find PII in text."""

    def scan(self, text):
        """
        Return the names of the PiiTypes whose common regex found a hit in ``text``.

        The result is a list of ``PiiTypes`` member names (strings), in the
        fixed order phone, email, credit card, street address.
        """
        parsed = CommonRegex(text)
        # (attribute on the CommonRegex result, PII category it indicates)
        checks = (
            ("phones", PiiTypes.PHONE),
            ("emails", PiiTypes.EMAIL),
            ("credit_cards", PiiTypes.CREDIT_CARD),
            ("street_addresses", PiiTypes.ADDRESS),
        )
        return [
            pii_type.name
            for attribute, pii_type in checks
            if getattr(parsed, attribute)
        ]
class ColumnNameScanner(Scanner):
    """
    Scanner that classifies a column from its name alone.

    The name is matched, case-insensitively, against two pattern tables.
    ``sensitive_regex`` is consulted first, then ``non_sensitive_regex``;
    within a table the patterns are tried in declaration order and the
    first hit wins.
    """

    # Patterns whose match marks the column as PII "Sensitive".
    sensitive_regex = {
        PiiTypes.PASSWORD: re.compile("^.*password.*$", re.IGNORECASE),
        PiiTypes.USER_NAME: re.compile("^.*user(id|name|).*$", re.IGNORECASE),
        PiiTypes.KEY: re.compile("^.*(key).*$", re.IGNORECASE),
        PiiTypes.SSN: re.compile("^.*(ssn|social).*$", re.IGNORECASE),
        PiiTypes.CREDIT_CARD: re.compile("^.*(card).*$", re.IGNORECASE),
        PiiTypes.BANKACC: re.compile("^.*(bank|acc|amount).*$", re.IGNORECASE),
        PiiTypes.EMAIL: re.compile("^.*(email|e-mail|mail).*$", re.IGNORECASE),
    }

    # Patterns whose match marks the column as PII "NonSensitive".
    non_sensitive_regex = {
        PiiTypes.PERSON: re.compile(
            "^.*(firstname|fname|lastname|lname|"
            "fullname|maidenname|_name|"
            "nickname|name_suffix|name).*$",
            re.IGNORECASE,
        ),
        PiiTypes.BIRTH_DATE: re.compile(
            "^.*(date_of_birth|dateofbirth|dob|"
            "birthday|date_of_death|dateofdeath).*$",
            re.IGNORECASE,
        ),
        PiiTypes.GENDER: re.compile("^.*(gender).*$", re.IGNORECASE),
        PiiTypes.NATIONALITY: re.compile("^.*(nationality).*$", re.IGNORECASE),
        PiiTypes.ADDRESS: re.compile(
            "^.*(address|city|state|county|country|"
            "zipcode|zip|postal|zone|borough).*$",
            re.IGNORECASE,
        ),
        PiiTypes.PHONE: re.compile("^.*(phone).*$", re.IGNORECASE),
    }

    def scan(self, text) -> Optional[ColumnPIIType]:
        """
        Classify a column name.

        Args:
            text: the column name to inspect.

        Returns:
            A single ``ColumnPIIType`` for the first pattern that matches
            (sensitive patterns take precedence), or ``None`` when no
            pattern matches.
        """
        # Sensitive table first so a name hitting both tables is reported
        # as sensitive.
        lookup_order = (
            (self.sensitive_regex, TagType.SENSITIVE),
            (self.non_sensitive_regex, TagType.NONSENSITIVE),
        )
        for patterns, tag_type in lookup_order:
            for pii_type, pattern in patterns.items():
                if pattern.match(text) is not None:
                    return ColumnPIIType(pii_types=pii_type, tag_type=tag_type)
        logging.debug("No PII pattern matched column name %s", text)
        return None
class PiiProcessor(Processor):
    """
    Processor that tags the columns of a table request as PII.

    Each column name is run through a ``ColumnNameScanner``; on a match an
    automated, suggested ``PII.<Sensitive|NonSensitive>`` tag label is
    attached to the column.
    """

    # Kept for compatibility with the original declaration; the value
    # handed to __init__ is stored on ``metadata``.
    metadata_config: OpenMetadata
    status: ProcessorStatus
    metadata: OpenMetadata

    def __init__(self, metadata_config: OpenMetadata):
        """
        Args:
            metadata_config: OpenMetadata client, stored as ``self.metadata``.
        """
        super().__init__()
        self.metadata = metadata_config
        self.status = ProcessorStatus()
        self.column_scanner = ColumnNameScanner()

    @classmethod
    def create(cls, config_dict: dict):  # pylint: disable=arguments-differ
        """
        Build a processor from a raw OpenMetadata connection config dict.

        The dict is parsed into an ``OpenMetadataConnection`` and wrapped in
        an ``OpenMetadata`` client, which is the type ``__init__`` expects.
        """
        server_config = OpenMetadataConnection.parse_obj(config_dict)
        return cls(OpenMetadata(server_config))

    def process(  # pylint: disable=arguments-differ
        self, table_request: CreateTableRequest
    ) -> Optional[CreateTableRequest]:
        """
        Attach PII tag labels to the matching columns of ``table_request``.

        The request is modified in place: a matched column gets the new
        label appended to its existing tags, or a fresh one-element tag list
        when it had none. Every column name, matched or not, is appended to
        ``self.status.records``.

        Returns:
            The same ``table_request`` object, after tagging.
        """
        for column in table_request.columns:
            column_name = column.name.__root__
            pii_match = self.column_scanner.scan(column_name)
            if pii_match is not None:
                pii_label = TagLabel(
                    tagFQN=f"{PII}.{pii_match.tag_type.value}",
                    labelType=LabelType.Automated.value,
                    state=State.Suggested.value,
                    source=TagSource.Tag.value,
                )
                if column.tags:
                    column.tags.append(pii_label)
                else:
                    # Covers both ``None`` and an empty list.
                    column.tags = [pii_label]
            self.status.records.append(column_name)
        return table_request

    def close(self):
        """Nothing to release."""

    def get_status(self) -> ProcessorStatus:
        """Return the status object accumulating the processed column names."""
        return self.status

View File

@ -206,6 +206,9 @@ class SalesforceSource(DatabaseServiceSource):
type="databaseSchema",
),
)
self.process_pii_sensitive_column(
metadata_config=self.metadata, table_request=table_request
)
yield table_request
self.register_record(table_request=table_request)

View File

@ -0,0 +1,397 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Test Processor Class
"""
from unittest import TestCase
from metadata.generated.schema.api.data.createTable import CreateTableRequest
from metadata.generated.schema.entity.data.table import Column, DataType, TableType
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
OpenMetadataConnection,
)
from metadata.generated.schema.security.client.openMetadataJWTClientConfig import (
OpenMetadataJWTClientConfig,
)
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.generated.schema.type.tagLabel import TagFQN, TagLabel
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.database.processor import PiiProcessor
# Input fixture for the PII processor test: a table request with six columns.
# Two of them (credit_card_amount, is_customer) already carry a
# "PersonalData.Personal" tag; the other four start with tags=None.
# NOTE(review): ordinalPosition=4 is repeated on the last three columns and
# EXPECTED_COLUMNS mirrors it -- confirm whether that is intentional.
MOCK_TABLE: CreateTableRequest = CreateTableRequest(
name="DataSet Input",
displayName="DataSet Input",
description="this is a description for dataset input",
tableType=TableType.Regular.value,
columns=[
# Untagged column.
Column(
name="amount",
displayName=None,
dataType=DataType.DOUBLE.value,
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="This is description for amount",
fullyQualifiedName=None,
tags=None,
constraint=None,
ordinalPosition=1,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Untagged column.
Column(
name="bank_transfer_amount",
displayName=None,
dataType=DataType.DOUBLE.value,
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=None,
constraint=None,
ordinalPosition=2,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Untagged column.
Column(
name="coupon_amount",
displayName=None,
dataType=DataType.DOUBLE.value,
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=None,
constraint=None,
ordinalPosition=3,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Column that already has one tag before processing.
Column(
name="credit_card_amount",
displayName=None,
dataType=DataType.DOUBLE.value,
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN="PersonalData.Personal",
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=4,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Untagged, mixed-case column name.
Column(
name="FirstName",
displayName=None,
dataType=DataType.STRING.value,
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=None,
constraint=None,
ordinalPosition=4,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Column that already has one tag before processing.
Column(
name="is_customer",
displayName=None,
dataType=DataType.BOOLEAN.value,
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN="PersonalData.Personal",
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=4,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
],
tableConstraints=None,
tablePartition=None,
tableProfilerConfig=None,
owner=None,
databaseSchema=EntityReference(
id="c3eb265f-5445-4ad3-ba5e-797d3a3071bb",
type="databaseSchema",
name=None,
fullyQualifiedName=None,
description=None,
displayName=None,
deleted=None,
href=None,
),
tags=None,
viewDefinition=None,
extension=None,
)
# Columns the test expects on MOCK_TABLE after PiiProcessor.process has run.
# They are the MOCK_TABLE columns with a "PII.Sensitive" or "PII.NonSensitive"
# label added where applicable; is_customer keeps only its original tag.
EXPECTED_COLUMNS = [
# Gains a PII.Sensitive tag.
Column(
name="amount",
displayName=None,
dataType="DOUBLE",
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="This is description for amount",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.Sensitive"),
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=1,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Gains a PII.Sensitive tag.
Column(
name="bank_transfer_amount",
displayName=None,
dataType="DOUBLE",
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.Sensitive"),
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=2,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Gains a PII.Sensitive tag.
Column(
name="coupon_amount",
displayName=None,
dataType="DOUBLE",
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.Sensitive"),
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=3,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Keeps its existing tag; PII.Sensitive is appended after it.
Column(
name="credit_card_amount",
displayName=None,
dataType="DOUBLE",
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN="PersonalData.Personal",
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
),
TagLabel(
tagFQN=TagFQN(__root__="PII.Sensitive"),
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
),
],
constraint=None,
ordinalPosition=4,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# Gains a PII.NonSensitive tag.
Column(
name="FirstName",
displayName=None,
dataType="STRING",
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN=TagFQN(__root__="PII.NonSensitive"),
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=4,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
# No PII tag expected; only the pre-existing tag remains.
Column(
name="is_customer",
displayName=None,
dataType="BOOLEAN",
arrayDataType=None,
dataLength=None,
precision=None,
scale=None,
dataTypeDisplay=None,
description="",
fullyQualifiedName=None,
tags=[
TagLabel(
tagFQN="PersonalData.Personal",
description=None,
source="Tag",
labelType="Automated",
state="Suggested",
href=None,
)
],
constraint=None,
ordinalPosition=4,
jsonSchema=None,
children=None,
customMetrics=None,
profile=None,
),
]
class PiiProcessorTest(TestCase):
    """
    Run this integration test with different type of column name
    to attach PII Tags
    """

    def __init__(
        self,
        methodName="runTest",
    ) -> None:
        """
        Build the processor under test.

        ``methodName`` defaults to ``"runTest"`` like ``unittest.TestCase``
        so the case can still be instantiated without arguments.
        """
        super().__init__(methodName)
        # NOTE(review): points at a local server with a hard-coded JWT --
        # presumably the default development setup; confirm before reuse.
        server_config = OpenMetadataConnection(
            hostPort="http://localhost:8585/api",
            authProvider="openmetadata",
            securityConfig=OpenMetadataJWTClientConfig(
                jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJh"
                "bGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vc"
                "mciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7Hgz"
                "GBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUx"
                "huv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakL"
                "Lzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM"
                "5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg"
            ),
        )
        metadata = OpenMetadata(server_config)
        self.processor = PiiProcessor(metadata_config=metadata)

    def test_process(self):
        """Processing MOCK_TABLE must yield exactly EXPECTED_COLUMNS."""
        # process() tags the columns of MOCK_TABLE in place.
        self.processor.process(MOCK_TABLE)
        # assertEqual reports the differing values on failure, unlike a
        # bare assert.
        self.assertEqual(MOCK_TABLE.columns, EXPECTED_COLUMNS)

View File

@ -43,6 +43,11 @@
"type": "boolean",
"default": true
},
"processPiiSensitive": {
"description": "Optional configuration to automatically tag columns that might contain sensitive information",
"type": "boolean",
"default": true
},
"useFqnForFiltering": {
"description": "Regex will be applied on fully qualified name (e.g service_name.db_name.schema_name.table_name) instead of raw name (e.g. table_name)",
"type": "boolean",

View File

@ -141,6 +141,7 @@ const AddIngestion = ({
data?.name ?? getIngestionName(serviceData.name, pipelineType),
ingestSampleData: sourceConfig?.generateSampleData ?? true,
useFqnFilter: sourceConfig?.useFqnForFiltering ?? false,
processPii: sourceConfig?.processPiiSensitive ?? false,
databaseServiceNames: sourceConfig?.dbServiceNames ?? [],
description: data?.description ?? '',
repeatFrequency:
@ -327,6 +328,7 @@ const AddIngestion = ({
tableFilterPattern,
topicFilterPattern,
useFqnFilter,
processPii,
} = state;
switch (serviceCategory) {
@ -335,6 +337,7 @@ const AddIngestion = ({
useFqnForFiltering: useFqnFilter,
includeViews: includeView,
includeTags: includeTags,
processPiiSensitive: processPii,
databaseFilterPattern: getFilterPatternData(
databaseFilterPattern,
showDatabaseFilter

View File

@ -132,6 +132,6 @@ describe('Test ConfigureIngestion component', () => {
expect(backButton).toBeInTheDocument();
expect(nextButton).toBeInTheDocument();
expect(filterPatternComponents).toHaveLength(3);
expect(toggleSwitchs).toHaveLength(5);
expect(toggleSwitchs).toHaveLength(6);
});
});

View File

@ -85,6 +85,7 @@ const ConfigureIngestion = ({
timeoutSeconds,
topicFilterPattern,
useFqnFilter,
processPii,
} = useMemo(
() => ({
chartFilterPattern: data.chartFilterPattern,
@ -121,6 +122,7 @@ const ConfigureIngestion = ({
timeoutSeconds: data.timeoutSeconds,
topicFilterPattern: data.topicFilterPattern,
useFqnFilter: data.useFqnFilter,
processPii: data.processPii,
}),
[data]
);
@ -185,6 +187,8 @@ const ConfigureIngestion = ({
const handleFqnFilter = () => toggleField('useFqnFilter');
const handleProcessPii = () => toggleField('processPii');
const handleQueryLogDuration = handleValueParseInt('queryLogDuration');
const handleResultLimit = handleValueParseInt('resultLimit');
@ -455,6 +459,25 @@ const ConfigureIngestion = ({
);
};
// Renders the toggle bound to the `processPii` form field together with its
// helper text. The switch has its own test id; the previous value,
// "include-lineage", was copied from a different toggle.
const getProcessPiiToggles = () => {
  return (
    <Field>
      <div className="tw-flex tw-gap-1">
        <label>{t('label.process-pii-sensitive-column')}</label>
        <ToggleSwitchV1
          checked={processPii}
          handleCheck={handleProcessPii}
          testId="process-pii"
        />
      </div>
      <p className="tw-text-grey-muted tw-mt-3">
        {t('message.process-pii-sensitive-column-message')}
      </p>
      {getSeparator('')}
    </Field>
  );
};
const getDashboardDBServiceName = () => {
return (
<Field>
@ -527,6 +550,7 @@ const ConfigureIngestion = ({
{getFilterPatterns()}
{getSeparator('')}
{getFqnForFilteringToggles()}
{getProcessPiiToggles()}
{getDatabaseFieldToggles()}
</Fragment>
);

View File

@ -133,6 +133,7 @@ export interface AddIngestionState {
timeoutSeconds: number;
topicFilterPattern: FilterPattern;
useFqnFilter: boolean;
processPii: boolean;
}
export enum ShowFilter {

View File

@ -501,6 +501,7 @@
"primary-key": "Primary Key",
"private-key": "PrivateKey",
"private-key-id": "Private Key ID",
"process-pii-sensitive-column": "Auto Tag PII",
"profile": "Profile",
"profile-lowercase": "profile",
"profile-sample-type": "Profile Sample {{type}}",
@ -956,6 +957,7 @@
"pipeline-description-message": "Description of the pipeline.",
"pipeline-trigger-success-message": "Pipeline triggered successfully!",
"pipeline-will-trigger-manually": "Pipeline will only be triggered manually.",
"process-pii-sensitive-column-message": "Check column names to auto tag PII Sensitive/NonSensitive columns.",
"profile-sample-percentage-message": "Set the Profiler value as percentage",
"profile-sample-row-count-message": " Set the Profiler value as row count",
"profiler-ingestion-description": "A profiler workflow can be configured and deployed after a metadata ingestion has been set up. Multiple profiler pipelines can be set up for the same database service. The pipeline feeds the Profiler tab of the Table entity, and also runs the tests configured for that entity. Add a Name, FQN, and define the filter pattern to start.",